Hacker News new | ask | show | jobs
by nwallin 1073 days ago
> I assume that their test input (which isn't described in the post, and is also not in their GitHub repo) consists of random strings consisting almost entirely of s and p characters.

test code is here: https://github.com/414owen/blog-code/blob/master/02-the-same... it randomly selects between 's' or 'p'. The characters can't be anything other than 's', 'p', or the terminating null. Knowing that particular fact about our input gives us this ...clever... optimization:

    int run_switches(const char* s) {
        int result = 0;
        while (*s)
            result += (1 | *s++) - 'r';
        return result;
    }
which compiles to:

    run_switches:
            movzx   eax, BYTE PTR [rdi]
            xor     edx, edx
            test    al, al
            je      .L1
    .L3:
            or      eax, 1
            inc     rdi
            movsx   eax, al
            lea     edx, [rdx-114+rax]
            movzx   eax, BYTE PTR [rdi]
            test    al, al
            jne     .L3
    .L1:
            mov     eax, edx
            ret
This is too clever by half, of course, but it perfectly illustrates your point about exploiting properties of the data.
1 comments

You do not even need to subtract 'r'.

    int run_switches(const char* s) {
        int s_count = 0;
        const char *begin = s;
        while(*s) {
            s_count += (1 & *s++);
        }
        int count = s-begin;
        return count - s_count;
    }
which compiles to:

    .L49:
        and     edx, 1
        add     rax, 1
        add     ecx, edx
        movzx   edx, BYTE PTR [rax]
        test    dl, dl
        jne     .L49
edit: other variant

    int run_switches2(const char* s) {
        const char *begin = s;
        int sum = 0;
        while(*s) {
            sum += *s++;
        }
        int count = s-begin;
        int s_count = sum - ('s'*count)/('p'-'s');
        int p_count = count - s_count;
        return p_count - s_count;
   }
which compiles to:

   run_switches2(char const*):
        movsx   eax, BYTE PTR [rdi]
        test    al, al
        je      .L56
        mov     rdx, rdi
        xor     ecx, ecx
   .L55:
        add     rdx, 1
        add     ecx, eax
        movsx   eax, BYTE PTR [rdx]
        test    al, al
        jne     .L55
        sub     rdx, rdi
        imul    esi, edx, 115
        movsx   rax, esi
        sar     esi, 31
        imul    rax, rax, 1431655766
        shr     rax, 32
        sub     eax, esi
        add     ecx, eax
        sub     edx, ecx
        mov     eax, edx
        sub     eax, ecx
        ret
   .L56:
        xor     eax, eax
        ret
None of these will beat the clever blocked SIMD someone showed elsethread.