Hacker News new | ask | show | jobs
by smitherfield 3051 days ago
1. Accesses through pointers type-punned to something other than `(un)signed char` are undefined behavior.

  uint64_t n = 0xdeadbeef;

  uint32_t foo = (uint32_t)n; // OK

  uint32_t *bar = (uint32_t*)&n; // "OK" but useless
  foo = *bar; // undefined behavior!!!

  uint8_t *baz = (uint8_t*)&n;
  uint8_t byte = *baz; // OK, uint8_t is `unsigned char`

  // Same-size integral types are OK
  const volatile long long *p = (const volatile long long*)&n;
  const volatile long long cvll = *p; // well-defined
2. Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.
1 comments

> Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.

Don't most C compilers support a pragma to control this? "#pragma pack(4)" for clang and gcc, I believe.

Given this (where I've made it add two arrays of 96-bit integers to make it easier to figure out the sizes in the assembly):

  #include <stdint.h>

  /* Cap member alignment at 4 bytes so the struct below gets no tail padding. */
  #pragma pack(4)
  /* A 96-bit value stored as a 64-bit low part followed by a 32-bit high part. */
  struct block_addr {
      uint64_t low;   /* low 64 bits, at offset 0 */
      uint32_t high;  /* high 32 bits, at offset 8 */
  };

  /*
   * Adds eight block_addr elements from a and b field by field, storing each
   * result in the corresponding element of c. low and high are summed
   * independently; no carry moves from low into high. Always returns 17.
   */
  int sum(struct block_addr * a, struct block_addr * b, struct block_addr * c)
  {
      for (int i = 0; i < 8; ++i)
      {
          c->low = a->low + b->low;
          /* The post-increments step all three pointers to the next element. */
          c++->high = a++->high + b++->high;
      }
      return 17;
  }
here is the code for the loop body, which the compiler unrolled to make it even easier to see how the structure is laid out:

  movq    (%rbx), %rax
  addq    (%r15), %rax
  movq    %rax, (%r14)
  movl    8(%rbx), %eax
  addl    8(%r15), %eax
  movl    %eax, 8(%r14)
  
  movq    12(%rbx), %rax
  addq    12(%r15), %rax
  movq    %rax, 12(%r14)
  movl    20(%rbx), %eax
  addl    20(%r15), %eax
  movl    %eax, 20(%r14)
  
  movq    24(%rbx), %rax
  addq    24(%r15), %rax
  movq    %rax, 24(%r14)
  movl    32(%rbx), %eax
  addl    32(%r15), %eax
  movl    %eax, 32(%r14)
  
  ...
  
  movq    84(%rbx), %rax
  addq    84(%r15), %rax
  movq    %rax, 84(%r14)
  movl    92(%rbx), %eax
  addl    92(%r15), %eax
  movl    %eax, 92(%r14)
(Some white space added, and the middle cut out). The 96-bit integers are now only taking up 96 bits.
Packed structs are possible, to be sure, but inhibit numerous optimizations, such as (relevant to this case) the use of vector instructions and vector registers.

Changing the loop to 4 iterations for compactness' sake, (aligned) structs of two u64s generate the following, vectorized code:

https://godbolt.org/g/jB4jki

  vmovdqu (%rsi), %xmm0
  vpaddq  (%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, (%rdx)
  vmovdqu 16(%rsi), %xmm0
  vpaddq  16(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 16(%rdx)
  vmovdqu 32(%rsi), %xmm0
  vpaddq  32(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 32(%rdx)
  vmovdqu 48(%rsi), %xmm0
  vpaddq  48(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 48(%rdx)
  retq
And if the pointer arguments are declared `restrict`, the loop can be vectorized even more aggressively:

  vmovdqu64       (%rsi), %zmm0
  vpaddq  (%rdi), %zmm0, %zmm0
  vmovdqu64       %zmm0, (%rdx)
  vzeroupper
  retq
Either of which is much more efficient than the code generated for unaligned, packed 96-bit structs:

  movq    (%rsi), %rax
  addq    (%rdi), %rax
  movq    %rax, (%rdx)
  movl    8(%rsi), %eax
  addl    8(%rdi), %eax
  movl    %eax, 8(%rdx)
  movq    16(%rsi), %rax
  addq    16(%rdi), %rax
  movq    %rax, 16(%rdx)
  movl    24(%rsi), %eax
  addl    24(%rdi), %eax
  movl    %eax, 24(%rdx)
  movq    32(%rsi), %rax
  addq    32(%rdi), %rax
  movq    %rax, 32(%rdx)
  movl    40(%rsi), %eax
  addl    40(%rdi), %eax
  movl    %eax, 40(%rdx)
  movq    48(%rsi), %rax
  addq    48(%rdi), %rax
  movq    %rax, 48(%rdx)
  movl    56(%rsi), %eax
  addl    56(%rdi), %eax
  movl    %eax, 56(%rdx)
  retq
A smaller cost is that in non-vector code, using a 64-bit register (rax) in 32-bit mode (eax) is wasting half of the register.

IIRC, unaligned loads and stores will also, at the hardware level, stall the pipeline and inhibit out-of-order execution.

Oops, I used `#pragma pack` incorrectly in my code, but it doesn't change the codegen for the 96-bit structs other than offsets. Also `restrict` is only needed on the output argument to enable full vectorization of the 128-bit structs.

New link: https://godbolt.org/g/8uGn4h