| HN Mirror

Y	Hacker News new \| ask \| show \| jobs

by smitherfield 3051 days ago

1. Accesses through pointers type-punned to something other than `(un(signed)) char` are undefined behavior.

  uint64_t n = 0xdeadbeef;

  uint32_t foo = (uint32_t)n; // OK

  uint32_t *bar = (uint32_t*)&n; // "OK" but useless
  foo = *bar; // undefined behavior!!!

  uint8_t *baz = (uint8_t*)&n;
  uint8_t byte = *baz; // OK, uint8_t is `unsigned char`

  // Same-size integral types are OK
  const volatile long long *p = (const volatile long long*)&n;
  const volatile long long cvll = *p; // well-defined

2. Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.

1 comments

tzs 3051 days ago

> Structs are aligned to the member with the strictest alignment requirement, so a struct of a `uint64_t` and a `uint32_t` will be aligned on an 8-byte boundary, meaning its size will be 128 bits.

Don't most C compilers support a pragma to control this? "#pragma pack(4)" for clang and gcc, I believe.

Given this (where I've made it add two arrays of 96-bit integers to make it easier to figure out the sizes in the assembly):

  #include <stdint.h>

  /*
   * Cap member alignment at 4 bytes for this struct only, so no tail padding
   * is added after `high`. The push/pop pair restores the previous packing
   * afterwards; a bare `#pragma pack(4)` would stay in effect for every
   * declaration that follows in the translation unit.
   */
  #pragma pack(push, 4)
  struct block_addr {
      uint64_t low;
      uint32_t high;
  };
  #pragma pack(pop)

  /*
   * Element-wise addition of two arrays of 8 block_addr values.
   *
   * For each of the 8 elements, c[i].low = a[i].low + b[i].low and
   * c[i].high = a[i].high + b[i].high. The two fields are added
   * independently: no carry is propagated from `low` into `high`.
   *
   * a, b: inputs, each pointing to at least 8 elements.
   * c:    output, pointing to at least 8 elements.
   *
   * Always returns 17.
   */
  int sum(struct block_addr * a, struct block_addr * b, struct block_addr * c)
  {
      for (int i = 0; i < 8; ++i)
      {
          c[i].low = a[i].low + b[i].low;
          c[i].high = a[i].high + b[i].high;
      }
      return 17;
  }

here is the code for the loop body, which the compiler unrolled to make it even easier to see how the structure is laid out:

  movq    (%rbx), %rax
  addq    (%r15), %rax
  movq    %rax, (%r14)
  movl    8(%rbx), %eax
  addl    8(%r15), %eax
  movl    %eax, 8(%r14)
  
  movq    12(%rbx), %rax
  addq    12(%r15), %rax
  movq    %rax, 12(%r14)
  movl    20(%rbx), %eax
  addl    20(%r15), %eax
  movl    %eax, 20(%r14)
  
  movq    24(%rbx), %rax
  addq    24(%r15), %rax
  movq    %rax, 24(%r14)
  movl    32(%rbx), %eax
  addl    32(%r15), %eax
  movl    %eax, 32(%r14)
  
  ...
  
  movq    84(%rbx), %rax
  addq    84(%r15), %rax
  movq    %rax, 84(%r14)
  movl    92(%rbx), %eax
  addl    92(%r15), %eax
  movl    %eax, 92(%r14)

(Some white space added, and the middle cut out.) The 96-bit integers now take up only 96 bits.

link

smitherfield 3050 days ago

Packed structs are possible, to be sure, but inhibit numerous optimizations, such as (relevant to this case) the use of vector instructions and vector registers.

Changing the loop to 4 iterations for compactness' sake, (aligned) structs of two u64s generate the following, vectorized code:

https://godbolt.org/g/jB4jki

  vmovdqu (%rsi), %xmm0
  vpaddq  (%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, (%rdx)
  vmovdqu 16(%rsi), %xmm0
  vpaddq  16(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 16(%rdx)
  vmovdqu 32(%rsi), %xmm0
  vpaddq  32(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 32(%rdx)
  vmovdqu 48(%rsi), %xmm0
  vpaddq  48(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 48(%rdx)
  retq

And if the pointer arguments are declared `restrict`, the loop can be vectorized even more aggressively:

  vmovdqu64       (%rsi), %zmm0
  vpaddq  (%rdi), %zmm0, %zmm0
  vmovdqu64       %zmm0, (%rdx)
  vzeroupper
  retq

Either of which is much more efficient than the code generated for unaligned, packed 96-bit structs:

  movq    (%rsi), %rax
  addq    (%rdi), %rax
  movq    %rax, (%rdx)
  movl    8(%rsi), %eax
  addl    8(%rdi), %eax
  movl    %eax, 8(%rdx)
  movq    16(%rsi), %rax
  addq    16(%rdi), %rax
  movq    %rax, 16(%rdx)
  movl    24(%rsi), %eax
  addl    24(%rdi), %eax
  movl    %eax, 24(%rdx)
  movq    32(%rsi), %rax
  addq    32(%rdi), %rax
  movq    %rax, 32(%rdx)
  movl    40(%rsi), %eax
  addl    40(%rdi), %eax
  movl    %eax, 40(%rdx)
  movq    48(%rsi), %rax
  addq    48(%rdi), %rax
  movq    %rax, 48(%rdx)
  movl    56(%rsi), %eax
  addl    56(%rdi), %eax
  movl    %eax, 56(%rdx)
  retq

A smaller cost is that in non-vector code, using a 64-bit register (rax) in 32-bit mode (eax) is wasting half of the register.

IIRC, unaligned loads and stores will also, at the hardware level, stall the pipeline and inhibit out-of-order execution.

link

smitherfield 3050 days ago

Oops, I used `#pragma pack` incorrectly in my code, but it doesn't change the codegen for the 96-bit structs other than offsets. Also `restrict` is only needed on the output argument to enable full vectorization of the 128-bit structs.

New link: https://godbolt.org/g/8uGn4h

link