-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathsimd_test.cpp
293 lines (265 loc) · 7.86 KB
/
simd_test.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
// SIMD particle update
#include "common.h"
#include <stdio.h>
#include <stdlib.h>
#if __SSE__
#include <emmintrin.h>
#endif
#if __AVX__
#include <immintrin.h>
#endif
// Number of particles allocated and updated in every particle buffer.
const int NUM_PARTICLES = 10000;
// Simulated frame rate; used to derive both the update count and the time step.
const int FRAMES_PER_SECOND = 60;
// Number of update calls each Test* function performs.
const int NUM_UPDATES = FRAMES_PER_SECOND * 10; // ten seconds of particle updates at 60fps
// Time step handed to every update call: 1000 / 60, i.e. about 16.67.
// NOTE(review): the step is expressed in milliseconds while gravity is -9.81;
// confirm the unit mix is intentional for this benchmark.
const float UPDATE_DELTA = 1000.0f / FRAMES_PER_SECOND; // delta in ms
// Array-of-structs particle storage: a single heap array of NUM_PARTICLES
// particle records, owned by this object and released in the destructor.
struct particle_buffer_AoS {
    // One particle record: position (x,y,z), velocity (vx,vy,vz) and a scalar t.
    struct particle {
        float x,y,z,vx,vy,vz,t;
    };
    particle *p;    // owning pointer to NUM_PARTICLES records (malloc/free)
    float gravity;  // acceleration constant read by SimpleUpdateParticlesAoS

    // Allocates the array and seeds every particle with a deterministic
    // position derived from its index, a fixed initial velocity and t = 0.
    // Aborts the process if the allocation fails.
    particle_buffer_AoS() {
        p = (particle*)malloc( sizeof(particle) * NUM_PARTICLES );
        if( !p ) {
            // Fail loudly instead of writing through a null pointer below.
            fprintf( stderr, "particle_buffer_AoS: allocation failed\n" );
            abort();
        }
        gravity = -9.81f;
        for( int i = 0; i < NUM_PARTICLES; ++i ) {
            p[i].x = (i%7)-3;
            p[i].y = (i%11)-5;
            p[i].z = (i%9)-4;
            p[i].vx = 2.0f;
            p[i].vy = 100.0f;
            p[i].vz = 7.0f;
            p[i].t = 0.0f;
        }
    }

    // Releases the particle array.
    ~particle_buffer_AoS() {
        free(p);
    }

private:
    // Not copyable: an implicit member-wise copy would leave two objects
    // owning the same array and free it twice. Declared but never defined.
    particle_buffer_AoS( const particle_buffer_AoS & );
    particle_buffer_AoS &operator=( const particle_buffer_AoS & );
};
// Struct-of-arrays particle storage: six separately allocated float arrays of
// NUM_PARTICLES elements each (three position components, three velocity
// components), owned by this object and released in the destructor.
struct particle_buffer {
    float *posx, *posy, *posz;  // position components, one array per axis
    float *vx, *vy, *vz;        // velocity components, one array per axis
    float gravity;              // acceleration constant read by the update routines

    // Allocates the six arrays with 32-byte alignment and seeds every particle
    // with a deterministic position derived from its index and a fixed initial
    // velocity. Aborts the process if any allocation fails.
    particle_buffer() {
        posx = alloc_component();
        posy = alloc_component();
        posz = alloc_component();
        vx = alloc_component();
        vy = alloc_component();
        vz = alloc_component();
        gravity = -9.81f;
        for( int i = 0; i < NUM_PARTICLES; ++i ) {
            posx[i] = (i%7)-3;
            posy[i] = (i%11)-5;
            posz[i] = (i%9)-4;
            vx[i] = 2.0f;
            vy[i] = 100.0f;
            vz[i] = 7.0f;
        }
    }

    // Releases all six component arrays.
    ~particle_buffer() {
        free(posx);
        free(posy);
        free(posz);
        free(vx);
        free(vy);
        free(vz);
    }

private:
    // Allocates one 32-byte-aligned array with room for NUM_PARTICLES floats.
    // aligned_alloc requires the size to be a multiple of the alignment, so the
    // byte count is rounded up; this keeps the call valid for any particle count.
    static float *alloc_component() {
        const size_t alignment = 32;
        size_t bytes = sizeof(float) * NUM_PARTICLES;
        bytes = ( bytes + alignment - 1 ) / alignment * alignment;
        float *block = (float*)aligned_alloc( alignment, bytes );
        if( !block ) {
            // Fail loudly instead of writing through a null pointer later.
            fprintf( stderr, "particle_buffer: allocation failed\n" );
            abort();
        }
        return block;
    }

    // Not copyable: an implicit member-wise copy would leave two objects
    // owning the same arrays and free them twice. Declared but never defined.
    particle_buffer( const particle_buffer & );
    particle_buffer &operator=( const particle_buffer & );
};
// Advances every particle of an array-of-structs buffer by one time step.
//
// pb         - buffer whose NUM_PARTICLES records are updated in place.
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
void SimpleUpdateParticlesAoS( particle_buffer_AoS *pb, float delta_time ) {
    const float gravity = pb->gravity;
    const float halfGravityDtSq = gravity * delta_time * delta_time * 0.5f;
    const float gravityDt = gravity * delta_time;

    // Walk the record array with a pointer instead of an index.
    particle_buffer_AoS::particle *cur = pb->p;
    particle_buffer_AoS::particle *const end = cur + NUM_PARTICLES;
    for( ; cur != end; ++cur ) {
        cur->x += cur->vx * delta_time;
        cur->y += cur->vy * delta_time + halfGravityDtSq;
        cur->z += cur->vz * delta_time;
        cur->vy += gravityDt;
    }
}
// Advances every particle of a struct-of-arrays buffer by one time step,
// touching all component arrays inside a single loop.
//
// pb         - buffer whose component arrays are updated in place.
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
void SimpleUpdateParticles( particle_buffer *pb, float delta_time ) {
    const float gravity = pb->gravity;
    const float halfGravityDtSq = gravity * delta_time * delta_time * 0.5f;
    const float gravityDt = gravity * delta_time;

    float *const px = pb->posx;
    float *const py = pb->posy;
    float *const pz = pb->posz;
    float *const velx = pb->vx;
    float *const vely = pb->vy;
    float *const velz = pb->vz;

    for( int n = 0; n != NUM_PARTICLES; ++n ) {
        px[n] += velx[n] * delta_time;
        py[n] += vely[n] * delta_time + halfGravityDtSq;
        pz[n] += velz[n] * delta_time;
        vely[n] += gravityDt;
    }
}
// Advances every particle of a struct-of-arrays buffer by one time step,
// processing one axis at a time in three separate passes over the arrays.
//
// pb         - buffer whose component arrays are updated in place.
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
void SliceUpdateParticles( particle_buffer *pb, float delta_time ) {
    const float gravity = pb->gravity;
    const float halfGravityDtSq = gravity * delta_time * delta_time * 0.5f;
    const float gravityDt = gravity * delta_time;

    // Pass 1: x axis.
    {
        float *const px = pb->posx;
        float *const velx = pb->vx;
        for( int n = 0; n != NUM_PARTICLES; ++n ) {
            px[n] += velx[n] * delta_time;
        }
    }

    // Pass 2: y axis, the only one affected by gravity.
    {
        float *const py = pb->posy;
        float *const vely = pb->vy;
        for( int n = 0; n != NUM_PARTICLES; ++n ) {
            py[n] += vely[n] * delta_time + halfGravityDtSq;
            vely[n] += gravityDt;
        }
    }

    // Pass 3: z axis.
    {
        float *const pz = pb->posz;
        float *const velz = pb->vz;
        for( int n = 0; n != NUM_PARTICLES; ++n ) {
            pz[n] += velz[n] * delta_time;
        }
    }
}
#if __SSE__
// Advances every particle of a struct-of-arrays buffer by one time step using
// 4-wide SSE arithmetic, updating all axes inside a single loop.
//
// pb         - buffer whose component arrays are updated in place. Aligned
//              loads/stores are used, so the arrays must be 16-byte aligned
//              (particle_buffer allocates them with 32-byte alignment).
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
// Particles are processed four at a time; if NUM_PARTICLES is not a multiple
// of 4 the remaining particles are finished with scalar code so that none is
// silently skipped.
void SIMD_SSE_UpdateParticles( particle_buffer *pb, float delta_time ) {
    const float g = pb->gravity;
    const float f_gd = g * delta_time;
    const float f_gd2 = g * delta_time * delta_time * 0.5f;

    // Broadcast the per-step constants to all four lanes.
    const __m128 mmd = _mm_set1_ps( delta_time );  // delta_time
    const __m128 mmgd = _mm_set1_ps( f_gd );       // gravity * delta_time
    const __m128 mmgd2 = _mm_set1_ps( f_gd2 );     // gravity * delta_time * delta_time * 0.5f

    float *const px = pb->posx;
    float *const py = pb->posy;
    float *const pz = pb->posz;
    float *const vx = pb->vx;
    float *const vy = pb->vy;
    float *const vz = pb->vz;

    // Largest multiple of 4 that fits in NUM_PARTICLES.
    const int simdCount = NUM_PARTICLES - ( NUM_PARTICLES % 4 );

    int i = 0;
    for( ; i < simdCount; i += 4 ) {
        const __m128 velx = _mm_load_ps( vx + i );
        const __m128 vely = _mm_load_ps( vy + i );
        const __m128 velz = _mm_load_ps( vz + i );

        const __m128 dx = _mm_mul_ps( velx, mmd );
        const __m128 dy = _mm_add_ps( _mm_mul_ps( vely, mmd ), mmgd2 );
        const __m128 dz = _mm_mul_ps( velz, mmd );

        _mm_store_ps( px + i, _mm_add_ps( _mm_load_ps( px + i ), dx ) );
        _mm_store_ps( py + i, _mm_add_ps( _mm_load_ps( py + i ), dy ) );
        _mm_store_ps( pz + i, _mm_add_ps( _mm_load_ps( pz + i ), dz ) );
        _mm_store_ps( vy + i, _mm_add_ps( vely, mmgd ) );
    }

    // Scalar tail for the (at most three) particles left over.
    for( ; i < NUM_PARTICLES; ++i ) {
        px[i] += vx[i] * delta_time;
        py[i] += vy[i] * delta_time + f_gd2;
        pz[i] += vz[i] * delta_time;
        vy[i] += f_gd;
    }
}
// Advances every particle of a struct-of-arrays buffer by one time step using
// 4-wide SSE arithmetic, processing one axis at a time in three passes.
//
// pb         - buffer whose component arrays are updated in place. Aligned
//              loads/stores are used, so the arrays must be 16-byte aligned
//              (particle_buffer allocates them with 32-byte alignment).
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
// Each pass handles four particles per iteration; if NUM_PARTICLES is not a
// multiple of 4 the remaining particles are finished with scalar code so that
// none is silently skipped.
void SIMD_SSE_UpdateParticlesSliced( particle_buffer *pb, float delta_time ) {
    const float g = pb->gravity;
    const float f_gd = g * delta_time;
    const float f_gd2 = g * delta_time * delta_time * 0.5f;

    // Broadcast the per-step constants to all four lanes.
    const __m128 mmd = _mm_set1_ps( delta_time );  // delta_time
    const __m128 mmgd = _mm_set1_ps( f_gd );       // gravity * delta_time
    const __m128 mmgd2 = _mm_set1_ps( f_gd2 );     // gravity * delta_time * delta_time * 0.5f

    float *const px = pb->posx;
    float *const py = pb->posy;
    float *const pz = pb->posz;
    float *const vx = pb->vx;
    float *const vy = pb->vy;
    float *const vz = pb->vz;

    // Largest multiple of 4 that fits in NUM_PARTICLES.
    const int simdCount = NUM_PARTICLES - ( NUM_PARTICLES % 4 );

    // Pass 1: x axis.
    for( int i = 0; i < simdCount; i += 4 ) {
        const __m128 dx = _mm_mul_ps( _mm_load_ps( vx + i ), mmd );
        _mm_store_ps( px + i, _mm_add_ps( _mm_load_ps( px + i ), dx ) );
    }
    for( int i = simdCount; i < NUM_PARTICLES; ++i ) {
        px[i] += vx[i] * delta_time;
    }

    // Pass 2: y axis, the only one affected by gravity.
    for( int i = 0; i < simdCount; i += 4 ) {
        const __m128 vely = _mm_load_ps( vy + i );
        const __m128 dy = _mm_add_ps( _mm_mul_ps( vely, mmd ), mmgd2 );
        _mm_store_ps( py + i, _mm_add_ps( _mm_load_ps( py + i ), dy ) );
        _mm_store_ps( vy + i, _mm_add_ps( vely, mmgd ) );
    }
    for( int i = simdCount; i < NUM_PARTICLES; ++i ) {
        py[i] += vy[i] * delta_time + f_gd2;
        vy[i] += f_gd;
    }

    // Pass 3: z axis.
    for( int i = 0; i < simdCount; i += 4 ) {
        const __m128 dz = _mm_mul_ps( _mm_load_ps( vz + i ), mmd );
        _mm_store_ps( pz + i, _mm_add_ps( _mm_load_ps( pz + i ), dz ) );
    }
    for( int i = simdCount; i < NUM_PARTICLES; ++i ) {
        pz[i] += vz[i] * delta_time;
    }
}
#endif
#if __AVX__
// Advances every particle of a struct-of-arrays buffer by one time step using
// 8-wide AVX arithmetic, updating all axes inside a single loop.
//
// pb         - buffer whose component arrays are updated in place. Aligned
//              loads/stores are used, so the arrays must be 32-byte aligned
//              (particle_buffer allocates them with that alignment).
// delta_time - time step applied to every particle.
//
// Per particle: x += vx*dt, y += vy*dt + 0.5*g*dt*dt, z += vz*dt, vy += g*dt.
// Particles are processed eight at a time; if NUM_PARTICLES is not a multiple
// of 8 the remaining particles are finished with scalar code so that none is
// silently skipped.
void SIMD_AVX_UpdateParticles( particle_buffer *pb, float delta_time ) {
    const float g = pb->gravity;
    const float f_gd = g * delta_time;
    const float f_gd2 = g * delta_time * delta_time * 0.5f;

    // Broadcast the per-step constants to all eight lanes.
    const __m256 mm256d = _mm256_set1_ps( delta_time );  // delta_time
    const __m256 mm256gd = _mm256_set1_ps( f_gd );       // gravity * delta_time
    const __m256 mm256gd2 = _mm256_set1_ps( f_gd2 );     // gravity * delta_time * delta_time * 0.5f

    float *const px = pb->posx;
    float *const py = pb->posy;
    float *const pz = pb->posz;
    float *const vx = pb->vx;
    float *const vy = pb->vy;
    float *const vz = pb->vz;

    // Largest multiple of 8 that fits in NUM_PARTICLES.
    const int simdCount = NUM_PARTICLES - ( NUM_PARTICLES % 8 );

    int i = 0;
    for( ; i < simdCount; i += 8 ) {
        const __m256 velx = _mm256_load_ps( vx + i );
        const __m256 vely = _mm256_load_ps( vy + i );
        const __m256 velz = _mm256_load_ps( vz + i );

        const __m256 dx = _mm256_mul_ps( velx, mm256d );
        const __m256 dy = _mm256_add_ps( _mm256_mul_ps( vely, mm256d ), mm256gd2 );
        const __m256 dz = _mm256_mul_ps( velz, mm256d );

        _mm256_store_ps( px + i, _mm256_add_ps( _mm256_load_ps( px + i ), dx ) );
        _mm256_store_ps( py + i, _mm256_add_ps( _mm256_load_ps( py + i ), dy ) );
        _mm256_store_ps( pz + i, _mm256_add_ps( _mm256_load_ps( pz + i ), dz ) );
        _mm256_store_ps( vy + i, _mm256_add_ps( vely, mm256gd ) );
    }

    // Scalar tail for the (at most seven) particles left over.
    for( ; i < NUM_PARTICLES; ++i ) {
        px[i] += vx[i] * delta_time;
        py[i] += vy[i] * delta_time + f_gd2;
        pz[i] += vz[i] * delta_time;
        vy[i] += f_gd;
    }
}
#endif
// Bundles the particle buffers that the Test* functions update.
// Members are constructed in declaration order when a Data is created.
struct Data {
// Array-of-structs buffer, updated by TestAoS.
particle_buffer_AoS pbAoS;
// Struct-of-arrays buffer, updated by TestSoA.
particle_buffer pbsimple;
// Struct-of-arrays buffer, updated by TestSoASliced.
particle_buffer pbslice;
// Struct-of-arrays buffer, updated by both TestSSE and TestSSESliced.
particle_buffer pbSIMDSSE;
// Struct-of-arrays buffer, updated by TestAVX.
particle_buffer pbSIMDAVX;
};
// Global handle to the benchmark buffers. main points it at its local Data
// instance, and every Test* function reaches its buffer through it.
Data *gData;
#if __AVX__
// Runs NUM_UPDATES consecutive AVX updates on gData->pbSIMDAVX, each with a
// time step of UPDATE_DELTA.
void TestAVX() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SIMD_AVX_UpdateParticles( &gData->pbSIMDAVX, UPDATE_DELTA );
    }
}
#endif
#if __SSE__
// Runs NUM_UPDATES consecutive SSE updates on gData->pbSIMDSSE, each with a
// time step of UPDATE_DELTA.
void TestSSE() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SIMD_SSE_UpdateParticles( &gData->pbSIMDSSE, UPDATE_DELTA );
    }
}
// Runs NUM_UPDATES consecutive axis-by-axis SSE updates, each with a time step
// of UPDATE_DELTA. Operates on gData->pbSIMDSSE, the same buffer TestSSE uses.
void TestSSESliced() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SIMD_SSE_UpdateParticlesSliced( &gData->pbSIMDSSE, UPDATE_DELTA );
    }
}
#endif
// Runs NUM_UPDATES consecutive axis-by-axis scalar updates on gData->pbslice,
// each with a time step of UPDATE_DELTA.
void TestSoASliced() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SliceUpdateParticles( &gData->pbslice, UPDATE_DELTA );
    }
}
// Runs NUM_UPDATES consecutive single-loop scalar updates on gData->pbsimple,
// each with a time step of UPDATE_DELTA.
void TestSoA() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SimpleUpdateParticles( &gData->pbsimple, UPDATE_DELTA );
    }
}
// Runs NUM_UPDATES consecutive array-of-structs updates on gData->pbAoS, each
// with a time step of UPDATE_DELTA.
void TestAoS() {
    int remaining = NUM_UPDATES;
    while( remaining-- > 0 ) {
        SimpleUpdateParticlesAoS( &gData->pbAoS, UPDATE_DELTA );
    }
}
// Entry point: creates the particle buffers, publishes them through gData,
// builds the list of test functions with their labels and hands it to RunTests.
// Test and RunTests are not defined in this file (presumably provided by
// common.h -- confirm there how the list is sized and executed).
int main() {
// All buffers live in this local object for the duration of main.
Data data;
gData = &data;
// Each entry pairs a test function with the label passed alongside it.
// The AVX and SSE entries are only compiled in when the corresponding
// instruction set macro is set.
Test tests[] = {
(Test){ TestAoS, "array of structs" },
(Test){ TestSoA, "struct of arrays naive processing" },
(Test){ TestSoASliced, "struct of arrays partitioned processing" },
#if __AVX__
(Test){ TestAVX, "AVX" },
#endif
#if __SSE__
(Test){ TestSSE, "SSE" },
(Test){ TestSSESliced, "SSE partitioned" },
#endif
};
printf("Trialling with %i particles over %i updates\n", NUM_PARTICLES, NUM_UPDATES );
RunTests( tests );
return 0;
}