|
|
|
|
|
by prirun
1172 days ago
|
|
Modern CPU caches are usually loaded in 64-byte units (cache lines) - much larger than 128 bits. I just ran some tests with a C program on an Intel i5 with both AoS and SoA, using a list of 1B points with 32-bit X and Y components. Looping through the list of points and totaling all X and Y components was the same speed with either AoS or SoA. It's easy to make intuitive guesses about how things are working that seem completely reasonable. But you have to benchmark, because modern CPUs are so complex that reasoning and intuition mostly don't work. The programs used for testing are below. I ran everything twice because my system wasn't always idle, so take the lower of the 2 runs.
[jim@mbp ~]$ sh -x x
+ cat x1.c
/*
 * x1.c - array-of-structs (AoS) layout, one fused loop per phase.
 *
 * Fills NUM {x, y} points with x = y = index, then sums every x and y
 * component in a single pass and prints the total.
 */
#include <stdio.h>

#define NUM 1000000000 /* number of points */

/* File-scope array: static storage duration, zero-initialized at startup. */
struct {
    int x;
    int y;
} p[NUM];

int main(void)
{
    int i;
    /*
     * The exact total, NUM * (NUM - 1), exceeds INT_MAX for a 32-bit int,
     * and signed overflow is undefined behavior. Accumulate in unsigned,
     * where wraparound modulo UINT_MAX + 1 is well defined.
     */
    unsigned s = 0;

    /* Fill phase: both components of a point are written together. */
    for (i = 0; i < NUM; i++) {
        p[i].x = i;
        p[i].y = i;
    }

    /* Sum phase: both components of a point are read together. */
    for (i = 0; i < NUM; i++) {
        s += (unsigned)p[i].x + (unsigned)p[i].y;
    }

    /* Prints the total reduced modulo UINT_MAX + 1. */
    printf("s=%u\n", s);
    return 0;
}
+ cc -o x1 x1.c
+ ./x1
s=1808348672
real 0m12.078s
user 0m7.319s
sys 0m4.363s
+ ./x1
s=1808348672
real 0m9.415s
user 0m6.677s
sys 0m2.685s
+ cat x2.c
/*
 * x2.c - struct-of-arrays (SoA) layout, one fused loop per phase.
 *
 * Fills two parallel NUM-element arrays with x[i] = y[i] = i, then sums
 * every element of both arrays in a single pass and prints the total.
 */
#include <stdio.h>

#define NUM 1000000000 /* number of points */

/* File-scope arrays: static storage duration, zero-initialized at startup. */
int x[NUM];
int y[NUM];

int main(void)
{
    int i;
    /*
     * The exact total, NUM * (NUM - 1), exceeds INT_MAX for a 32-bit int,
     * and signed overflow is undefined behavior. Accumulate in unsigned,
     * where wraparound modulo UINT_MAX + 1 is well defined.
     */
    unsigned s = 0;

    /* Fill phase: each iteration writes the same index in both arrays. */
    for (i = 0; i < NUM; i++) {
        x[i] = i;
        y[i] = i;
    }

    /* Sum phase: each iteration reads the same index from both arrays. */
    for (i = 0; i < NUM; i++) {
        s += (unsigned)x[i] + (unsigned)y[i];
    }

    /* Prints the total reduced modulo UINT_MAX + 1. */
    printf("s=%u\n", s);
    return 0;
}
+ cc -o x2 x2.c
+ ./x2
s=1808348672
real 0m9.753s
user 0m6.713s
sys 0m2.967s
+ ./x2
s=1808348672
real 0m9.642s
user 0m6.674s
sys 0m2.902s
+ cat x3.c
/*
 * x3.c - array-of-structs (AoS) layout, one loop per component.
 *
 * Fills NUM {x, y} points with x = y = index using a separate pass for each
 * component, then sums the x components and the y components in two further
 * passes and prints the combined total.
 */
#include <stdio.h>

#define NUM 1000000000 /* number of points */

/* File-scope array: static storage duration, zero-initialized at startup. */
struct {
    int x;
    int y;
} p[NUM];

int main(void)
{
    int i;
    /*
     * The exact total, NUM * (NUM - 1), exceeds INT_MAX for a 32-bit int,
     * and signed overflow is undefined behavior. Accumulate in unsigned,
     * where wraparound modulo UINT_MAX + 1 is well defined.
     */
    unsigned s = 0;

    /* Fill phase: one full pass over the array per component. */
    for (i = 0; i < NUM; i++) {
        p[i].x = i;
    }
    for (i = 0; i < NUM; i++) {
        p[i].y = i;
    }

    /* Sum phase: again one full pass per component. */
    for (i = 0; i < NUM; i++) {
        s += (unsigned)p[i].x;
    }
    for (i = 0; i < NUM; i++) {
        s += (unsigned)p[i].y;
    }

    /* Prints the total reduced modulo UINT_MAX + 1. */
    printf("s=%u\n", s);
    return 0;
}
+ cc -o x3 x3.c
+ ./x3
s=1808348672
real 0m13.844s
user 0m11.095s
sys 0m2.700s
+ ./x3
s=1808348672
real 0m13.686s
user 0m11.038s
sys 0m2.611s
+ cat x4.c
/*
 * x4.c - struct-of-arrays (SoA) layout, one loop per array.
 *
 * Fills two parallel NUM-element arrays with x[i] = y[i] = i using a
 * separate pass for each array, then sums x and y in two further passes and
 * prints the combined total.
 */
#include <stdio.h>

#define NUM 1000000000 /* number of points */

/* File-scope arrays: static storage duration, zero-initialized at startup. */
int x[NUM];
int y[NUM];

int main(void)
{
    int i;
    /*
     * The exact total, NUM * (NUM - 1), exceeds INT_MAX for a 32-bit int,
     * and signed overflow is undefined behavior. Accumulate in unsigned,
     * where wraparound modulo UINT_MAX + 1 is well defined.
     */
    unsigned s = 0;

    /* Fill phase: one full pass per array. */
    for (i = 0; i < NUM; i++) {
        x[i] = i;
    }
    for (i = 0; i < NUM; i++) {
        y[i] = i;
    }

    /* Sum phase: again one full pass per array. */
    for (i = 0; i < NUM; i++) {
        s += (unsigned)x[i];
    }
    for (i = 0; i < NUM; i++) {
        s += (unsigned)y[i];
    }

    /* Prints the total reduced modulo UINT_MAX + 1. */
    printf("s=%u\n", s);
    return 0;
}
+ cc -o x4 x4.c
+ ./x4
s=1808348672
real 0m13.530s
user 0m10.851s
sys 0m2.633s
+ ./x4
s=1808348672
real 0m13.489s
user 0m10.856s
sys 0m2.603s
|
|