| HN Mirror

#include <stdint.h>

/*
 * Pack struct members on 4-byte boundaries.  With this in effect,
 * struct block_addr occupies 12 bytes (96 bits): `high` directly follows
 * `low` and there is no tail padding, so consecutive array elements are
 * only 4-byte aligned.  The pragma is not reset here, so it also applies
 * to any struct declared later in the same translation unit.
 */
#pragma pack(4)
struct block_addr {
    uint64_t low;
    uint32_t high;
};

/*
 * Member-wise addition of two arrays of block_addr.
 *
 * For each of the first 8 elements, stores a[i].low + b[i].low into
 * c[i].low and a[i].high + b[i].high into c[i].high.  The two members are
 * added independently with ordinary unsigned wrap-around; no carry is
 * propagated from `low` into `high`.
 *
 * a, b: input arrays; each must hold at least 8 elements.
 * c:    output array; must hold at least 8 elements.
 *
 * Returns the constant 17.
 */
int sum(struct block_addr *a, struct block_addr *b, struct block_addr *c)
{
    for (int i = 0; i < 8; ++i) {
        c->low = a->low + b->low;
        /* The post-increments step all three pointers to the next element. */
        c++->high = a++->high + b++->high;
    }
    return 17;
}

  movq (%rbx), %rax
  addq (%r15), %rax
  movq %rax, (%r14)
  movl 8(%rbx), %eax
  addl 8(%r15), %eax
  movl %eax, 8(%r14)
  movq 12(%rbx), %rax
  addq 12(%r15), %rax
  movq %rax, 12(%r14)
  movl 20(%rbx), %eax
  addl 20(%r15), %eax
  movl %eax, 20(%r14)
  movq 24(%rbx), %rax
  addq 24(%r15), %rax
  movq %rax, 24(%r14)
  movl 32(%rbx), %eax
  addl 32(%r15), %eax
  movl %eax, 32(%r14)
  ...
  movq 84(%rbx), %rax
  addq 84(%r15), %rax
  movq %rax, 84(%r14)
  movl 92(%rbx), %eax
  addl 92(%r15), %eax
  movl %eax, 92(%r14)

Packed structs are possible, to be sure, but inhibit numerous optimizations, such as (relevant to this case) the use of vector instructions and vector registers.

With the loop changed to 4 iterations for compactness' sake, (aligned) structs of two u64s generate the following vectorized code:

https://godbolt.org/g/jB4jki

  vmovdqu (%rsi), %xmm0
  vpaddq  (%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, (%rdx)
  vmovdqu 16(%rsi), %xmm0
  vpaddq  16(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 16(%rdx)
  vmovdqu 32(%rsi), %xmm0
  vpaddq  32(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 32(%rdx)
  vmovdqu 48(%rsi), %xmm0
  vpaddq  48(%rdi), %xmm0, %xmm0
  vmovdqu %xmm0, 48(%rdx)
  retq

And if the pointer arguments are declared `restrict`, the loop can be vectorized even more aggressively:

  vmovdqu64       (%rsi), %zmm0
  vpaddq  (%rdi), %zmm0, %zmm0
  vmovdqu64       %zmm0, (%rdx)
  vzeroupper
  retq

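Either of these is much more efficient than the code generated for unaligned, packed 96-bit structs. (Note that the listing below steps through memory in 16-byte strides, i.e. it is the u64 + u32 struct without packing; the packed version shown earlier uses 12-byte strides but the same scalar, one-member-at-a-time instruction pattern.)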

  movq    (%rsi), %rax
  addq    (%rdi), %rax
  movq    %rax, (%rdx)
  movl    8(%rsi), %eax
  addl    8(%rdi), %eax
  movl    %eax, 8(%rdx)
  movq    16(%rsi), %rax
  addq    16(%rdi), %rax
  movq    %rax, 16(%rdx)
  movl    24(%rsi), %eax
  addl    24(%rdi), %eax
  movl    %eax, 24(%rdx)
  movq    32(%rsi), %rax
  addq    32(%rdi), %rax
  movq    %rax, 32(%rdx)
  movl    40(%rsi), %eax
  addl    40(%rdi), %eax
  movl    %eax, 40(%rdx)
  movq    48(%rsi), %rax
  addq    48(%rdi), %rax
  movq    %rax, 48(%rdx)
  movl    56(%rsi), %eax
  addl    56(%rdi), %eax
  movl    %eax, 56(%rdx)
  retq

A smaller cost is that, in non-vector code, using a 64-bit register (rax) through its 32-bit sub-register (eax) wastes half of the register.

IIRC, unaligned loads and stores will also, at the hardware level, stall the pipeline and inhibit out-of-order execution.