Hacker News new | ask | show | jobs
by tzs 3049 days ago
Don't 64-bit CPUs usually have efficient instructions for operating on narrower values?

For instance, consider this C code for adding two 96-bit numbers on a 64-bit machine (ignoring carry for now):

  #include <stdint.h>

  extern void mark(void);

  int sum(uint64_t * a, uint64_t * b, uint64_t * c)
  {
      mark();
      *c++ = *a++ + *b++;
      mark();
      *(uint32_t *)c = *(uint32_t *)a + *(uint32_t *)b;
      mark();
      return 17;
  }
The purpose of the mark() function is to make it easier to see the code for the additions in the assembly output from the compiler. Here is what "cc -S -O3" (whatever cc comes with MacOS High Sierra) produces for my 64-bit Intel Core i5 for the parts that actually do the math:

  callq   _mark
  movq    (%rbx), %rax
  addq    (%r15), %rax
  movq    %rax, (%r14)
  callq   _mark
  movl    8(%rbx), %eax
  addl    8(%r15), %eax
  movl    %eax, 8(%r14)
  callq   _mark
I'm not too familiar with x86-64 assembly, but I am assuming that this could be made to handle carry by changing the "addl" to whatever the 32-bit version of adding with carry is.

Taking out the (uint32_t * ) casts to turn the C code from 96-bit adding into 128-bit adding generates assembly code that only differs in that both movl instruction become movq instructions, and addl becomes addq.

So, if you were writing in C it looks like a 96-bit add would be a little uglier than a 128-bit add because of the casts but isn't slower or bigger under the hood. But note that this is assuming accessing the 96-bit number as an array of variable sized parts. It's that assumption that introduces the need for ugly casts.

If a struct is used, then there is no need for casts:

  #include <stdint.h>

  typedef struct {
      uint64_t low;
      uint32_t high;
  } addr;

  extern void mark(void);

  int sum(addr * a, addr * b, addr * c)
  {
      mark();
      c->low = a->low + b->low;
      mark();
      c->high = a->high + b->high;
      mark();
      return 17;
  }
This generates the same code as the earlier version.

(I still have no idea how to handle the carry in C, or at least no idea that is not ridiculously inefficient. When I've implemented big integer libraries I've either used a type for my "digits" that is smaller than the native integer size so that I could detect a carry by a simple AND, or I've handled low level addition in assembly).

3 comments

1. Accesses through pointers type-punned to something other than `(un(signed)) char` are undefined behavior.

  uint64_t n = 0xdeadbeef;

  uint32_t foo = (uint32_t)n; // OK

  uint32_t *bar = (uint32_t*)&n; // "OK" but useless
  foo = *bar; // undefined behavior!!!

  uint8_t *baz = (uint8_t*)&n;
  uint8_t byte = *baz; // OK, uint8_t is `unsigned char`

  // Same-size integral types are OK
  const volatile long long p = (const volatile long long*)&n;
  const volatile long long cvll = *p; // well-defined
2. Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.
> Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.

Don't most C compilers support a pragma to control this? "#pragma pack(4)" for clang and gcc, I believe.

Given this (where I've made it add two arrays of 96-bit integers to make it easier to figure out the sizes in the assemply):

  #include <stdint.h>

  #pragma pack(4)
  struct block_addr {
      uint64_t low;
      uint32_t high;
  };

  int sum(struct block_addr * a, struct block_addr * b, struct block_addr * c)
  {
      for (int i = 0; i < 8; ++i)
      {
          c->low = a->low + b->low;
          c++->high = a++->high + b++->high;
      }
      return 17;
  }
here is the code for the loop body, which the compiler unrolled to make it even easier to see how the structure is laid out:

  movq    (%rbx), %rax
  addq    (%r15), %rax
  movq    %rax, (%r14)
  movl    8(%rbx), %eax
  addl    8(%r15), %eax
  movl    %eax, 8(%r14)
  
  movq    12(%rbx), %rax
  addq    12(%r15), %rax
  movq    %rax, 12(%r14)
  movl    20(%rbx), %eax
  addl    20(%r15), %eax
  movl    %eax, 20(%r14)
  
  movq    24(%rbx), %rax
  addq    24(%r15), %rax
  movq    %rax, 24(%r14)
  movl    32(%rbx), %eax
  addl    32(%r15), %eax
  movl    %eax, 32(%r14)
  
  ...
  
  movq    84(%rbx), %rax
  addq    84(%r15), %rax
  movq    %rax, 84(%r14)
  movl    92(%rbx), %eax
  addl    92(%r15), %eax
  movl    %eax, 92(%r14)
(Some white space added, and the middle cut out). The 96-bit inters are now only taking up 96-bits.
Packed structs are possible, to be sure, but inhibit numerous optimizations, such as (relevant to this case) the use of vector instructions and vector registers.

Changing the loop to 4 iterations for compactness' sake, (aligned) structs of two u64s generate the following, vectorized code:

https://godbolt.org/g/jB4jki

  vmovdqu (%rsi), %xmm0
  vpaddq  (%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, (%rdx)
  vmovdqu 16(%rsi), %xmm0
  vpaddq  16(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 16(%rdx)
  vmovdqu 32(%rsi), %xmm0
  vpaddq  32(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 32(%rdx)
  vmovdqu 48(%rsi), %xmm0
  vpaddq  48(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 48(%rdx)
  retq
And if the pointer arguments are declared `restrict`, the loop can be vectorized even more aggressively:

  vmovdqu64       (%rsi), %zmm0
  vpaddq  (%rdi), %zmm0, %zmm0
  vmovdqu64       %zmm0, (%rdx)
  vzeroupper
  retq
Either of which is much more efficient than the code generated for unaligned, packed 96-bit structs:

  movq    (%rsi), %rax
  addq    (%rdi), %rax
  movq    %rax, (%rdx)
  movl    8(%rsi), %eax
  addl    8(%rdi), %eax
  movl    %eax, 8(%rdx)
  movq    16(%rsi), %rax
  addq    16(%rdi), %rax
  movq    %rax, 16(%rdx)
  movl    24(%rsi), %eax
  addl    24(%rdi), %eax
  movl    %eax, 24(%rdx)
  movq    32(%rsi), %rax
  addq    32(%rdi), %rax
  movq    %rax, 32(%rdx)
  movl    40(%rsi), %eax
  addl    40(%rdi), %eax
  movl    %eax, 40(%rdx)
  movq    48(%rsi), %rax
  addq    48(%rdi), %rax
  movq    %rax, 48(%rdx)
  movl    56(%rsi), %eax
  addl    56(%rdi), %eax
  movl    %eax, 56(%rdx)
  retq
A smaller cost is that in non-vector code, using a 64-bit register (rax) in 32-bit mode (eax) is wasting half of the register.

IIRC, unaligned loads and stores will also, at the hardware level, stall the pipeline and inhibit out-of-order execution.

Oops, I used `#pragma pack` incorrectly in my code, but it doesn't change the codegen for the 96-bit structs other than offsets. Also `restrict` is only needed on the output argument to enable full vectorization of the 128-bit structs.

New link: https://godbolt.org/g/8uGn4h

See my little test program at https://godbolt.org/g/53SAMq

I believe this program properly handles carry from the low to high part.

The 96- and 128-bit code have the same number of instructions, but the 128-bit code has more instruction bytes due to "REX prefixes" (i.e., 32-bit register add is 3 bytes of opcode, 64-bit register add is 4)

Here's a better variant: https://godbolt.org/g/xRtr4i
nice
You can do it like this: https://godbolt.org/g/r6WruQ