|
Packed structs are possible, to be sure, but inhibit numerous optimizations, such as (relevant to this case) the use of vector instructions and vector registers. Changing the loop to 4 iterations for compactness' sake, (aligned) structs of two u64s generate the following vectorized code (https://godbolt.org/g/jB4jki):
vmovdqu (%rsi), %xmm0
vpaddq (%rdi), %xmm0, %xmm0
vmovdqu %xmm0, (%rdx)
vmovdqu 16(%rsi), %xmm0
vpaddq 16(%rdi), %xmm0, %xmm0
vmovdqu %xmm0, 16(%rdx)
vmovdqu 32(%rsi), %xmm0
vpaddq 32(%rdi), %xmm0, %xmm0
vmovdqu %xmm0, 32(%rdx)
vmovdqu 48(%rsi), %xmm0
vpaddq 48(%rdi), %xmm0, %xmm0
vmovdqu %xmm0, 48(%rdx)
retq
And if the pointer arguments are declared `restrict`, the loop can be vectorized even more aggressively:
vmovdqu64 (%rsi), %zmm0
vpaddq (%rdi), %zmm0, %zmm0
vmovdqu64 %zmm0, (%rdx)
vzeroupper
retq
Either of which is much more efficient than the code generated for unaligned, packed 96-bit structs:
movq (%rsi), %rax
addq (%rdi), %rax
movq %rax, (%rdx)
movl 8(%rsi), %eax
addl 8(%rdi), %eax
movl %eax, 8(%rdx)
movq 16(%rsi), %rax
addq 16(%rdi), %rax
movq %rax, 16(%rdx)
movl 24(%rsi), %eax
addl 24(%rdi), %eax
movl %eax, 24(%rdx)
movq 32(%rsi), %rax
addq 32(%rdi), %rax
movq %rax, 32(%rdx)
movl 40(%rsi), %eax
addl 40(%rdi), %eax
movl %eax, 40(%rdx)
movq 48(%rsi), %rax
addq 48(%rdi), %rax
movq %rax, 48(%rdx)
movl 56(%rsi), %eax
addl 56(%rdi), %eax
movl %eax, 56(%rdx)
retq
A smaller cost is that in non-vector code, using a 64-bit register (rax) in 32-bit mode (eax) wastes half of the register. IIRC, unaligned loads and stores will also, at the hardware level, stall the pipeline and inhibit out-of-order execution. |
New link: https://godbolt.org/g/8uGn4h