-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRasterizer_AVX.inl
480 lines (366 loc) · 15.5 KB
/
Rasterizer_AVX.inl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
#include "SoftMath.h"
static inline
void RasterizeTriangle_AVX(
const XVertex& v1, const XVertex& v2, const XVertex& v3,
const SoftRenderContext& context
)
{
mxPROFILE_SCOPE("Rasterize Triangle");
const int W = context.W; // viewport width
//const int H = context.H; // viewport height
const F4 fX1 = v1.P.x;
const F4 fX2 = v2.P.x;
const F4 fX3 = v3.P.x;
const F4 fY1 = v1.P.y;
const F4 fY2 = v2.P.y;
const F4 fY3 = v3.P.y;
const F4 fZ1 = v1.P.z;
const F4 fZ2 = v2.P.z;
const F4 fZ3 = v3.P.z;
// NOTE: these are actually inverses of W (because of perspective division 1/w in ProjectVertex(), after vertex shader).
const F4 fInvW1 = v1.P.w;
const F4 fInvW2 = v2.P.w;
const F4 fInvW3 = v3.P.w;
/* excerpt from "A Parallel Algorithm for Polygon Rasterization" by Juan Pineda
(Computer Graphics, Volume 22, Number 4, August 1988).
Typically in 3D graphics rendering, polygon vertices are in floating
point format after 3D transformation and projection. Some
implementations round the X and Y floating point ordinates to
integer values, so that simple integer line algorithms can be used
compute the triangle edges. This rounding can leave gaps on the
order of 1/2 pixel wide between adjacent polygons that do not share
common vertices. Gaps also occur as a result of the finite precision
used in specifying the endpoints, but these gaps are much narrower.
Some implementations attempt to eUminate these gaps by growing the
edges of triangles to insure overlap, but these solutions cause other
artifacts.
In order to minimize these artifacts, it is desirable to render triangle
edges as close as possible to the real line between two vertices. This
is conveniently done with this algorithm by performing the
interpolator setup computations in floating point, and converting to
fixed point at the end:
*/
// 28.4 fixed-point coordinates
// (12 bit resolution, enough for a 4096x4096 color buffer)
// (4 bits of sub-pixel precision, enough for a 2048x2048 color buffer)
//
const INT32 X1 = iround( 16.0f * fX1 );
const INT32 X2 = iround( 16.0f * fX2 );
const INT32 X3 = iround( 16.0f * fX3 );
const INT32 Y1 = iround( 16.0f * fY1 );
const INT32 Y2 = iround( 16.0f * fY2 );
const INT32 Y3 = iround( 16.0f * fY3 );
//const INT32 Z1 = iround( 16.0f * fZ1 );
//const INT32 Z2 = iround( 16.0f * fZ2 );
//const INT32 Z3 = iround( 16.0f * fZ3 );
//const INT32 W1 = iround( 16.0f * fInvW1 );
//const INT32 W2 = iround( 16.0f * fInvW2 );
//const INT32 W3 = iround( 16.0f * fInvW3 );
//
// deltas
const INT32 DeltaX12 = X1 - X2;
const INT32 DeltaX23 = X2 - X3;
const INT32 DeltaX31 = X3 - X1;
const INT32 DeltaY12 = Y1 - Y2;
const INT32 DeltaY23 = Y2 - Y3;
const INT32 DeltaY31 = Y3 - Y1;
//// back-face culling
//// calculate signed area of triangle in screen space
//// this is actually twice the area
//const int nArea = DeltaX12 * DeltaY31 - DeltaY12 * DeltaX31;
//(void)nArea;
//if( CULL_MODE == Cull_CW && nArea <= 0 )
//{
// return;
//}
//else if( CULL_MODE == Cull_CCW && nArea >= 0 )
//{
// return;
//}
// 24.8 Fixed-point deltas
const INT32 FDX12 = DeltaX12 << 4;
const INT32 FDX23 = DeltaX23 << 4;
const INT32 FDX31 = DeltaX31 << 4;
const INT32 FDY12 = DeltaY12 << 4;
const INT32 FDY23 = DeltaY23 << 4;
const INT32 FDY31 = DeltaY31 << 4;
// Compute interpolation data
//const F4 fDeltaX12 = fX1 - fX2;
//const F4 fDeltaX23 = fX2 - fX3;
const F4 fDeltaX31 = fX3 - fX1;
//const F4 fDeltaY12 = fY1 - fY2;
//const F4 fDeltaY23 = fY2 - fY3;
const F4 fDeltaY31 = fY3 - fY1;
const F4 fDeltaZ21 = fZ2 - fZ1;
const F4 fDeltaZ31 = fZ3 - fZ1;
// NOTE: these are actually inverses of W (because of perspective division 1/w in ProjectVertex(), after vertex shader).
const F4 fDeltaInvW21 = fInvW2 - fInvW1;
const F4 fDeltaInvW31 = fInvW3 - fInvW1;
const F4 fDeltaX21 = fX2 - fX1;
const F4 fDeltaY21 = fY2 - fY1;
const F4 INTERP_C = fDeltaX21 * fDeltaY31 - fDeltaX31 * fDeltaY21;
// compute gradient for interpolating depth (aka Z)
mxSIMDALIGNED Vec2D vZ;
ComputeGradient_FPU(
INTERP_C,
fDeltaZ21, fDeltaZ31,
fDeltaX21, fDeltaX31,
fDeltaY21, fDeltaY31,
vZ.x, vZ.y
);
// to calculate perspective correct u:
// 1. For every vertex: calculate u' = u / w and w' = 1 / w (which can be linearly interpolated in screen space)
// 2. For every pixel: linearly interpolate u' _and_ w' and calculate perspective correct u ( interpolated u' / interpolated w' ).
// When dividing a value through the w component it can be properly interpolated in image space.
// We can then recover the w component at any position we want through inversion.
// This is also why the pseudodepth for depth-testing is not affected by the aforementioned problem. It has already been divided by w during homogenization.
// Setup inverse W (for interpolating W)
mxSIMDALIGNED Vec2D vInvW;
ComputeGradient_FPU(
INTERP_C,
fDeltaInvW21, fDeltaInvW31,
fDeltaX21, fDeltaX31,
fDeltaY21, fDeltaY31,
vInvW.x, vInvW.y
);
// Cache perspective correct varyings for the first vertex (for interpolation)
FLOAT vars1[ NUM_VARYINGS ];
for( UINT i = 0; i < NUM_VARYINGS; i++ )
{
vars1[i] = v1.vars[i] * fInvW1;
}
// varyings multiplied by inverse W (they can be linearly interpolated in screen space)
mxSIMDALIGNED Vec2D varsOverW[ NUM_VARYINGS ];
// setup varyings for interpolation
for( UINT i = 0; i < NUM_VARYINGS; i++ )
{
const F4 v1v = v1.vars[i] * fInvW1;
const F4 v2v = v2.vars[i] * fInvW2;
const F4 v3v = v3.vars[i] * fInvW3;
ComputeGradient_FPU(
INTERP_C,
v2v - v1v, v3v - v1v,
fDeltaX21, fDeltaX31,
fDeltaY21, fDeltaY31,
varsOverW[i].x, varsOverW[i].y
);
}
// Block size, standard 8x8 (must be power of two)
// NOTE: should be 8, must be small enough so that accumulative errors don't grow too much
//enum { BLOCK_SIZE = 8 }; //enum { BLOCK_SIZE_LOG2 = 3 };
//enum { BLOCK_SIZE = 16 };
//enum { BLOCK_SIZE = 32 };
//enum { BLOCK_SIZE = 4 };
//enum { BLOCK_SIZE = 2 };
//
enum { BLOCK_SIZE_X = 8 };
enum { BLOCK_SIZE_Y = 4 };
// "A Parallel Algorithm for Polygon Rasterization":
// Since the edge function is linear, it is possible to compute the value
// of the edge function for a pixel an arbitrary distance L away from a
// given point (x,y) :
// Half-edge constants in 28.4
INT32 C1 = DeltaY12 * X1 - DeltaX12 * Y1;
INT32 C2 = DeltaY23 * X2 - DeltaX23 * Y2;
INT32 C3 = DeltaY31 * X3 - DeltaX31 * Y3;
// correct for top-left fill convention
if( DeltaY12 < 0 || (DeltaY12 == 0 && DeltaX12 > 0) ) {
++C1;
}
if( DeltaY23 < 0 || (DeltaY23 == 0 && DeltaX23 > 0) ) {
++C2;
}
if( DeltaY31 < 0 || (DeltaY31 == 0 && DeltaX31 > 0) ) {
++C3;
}
// Bounding rectangle of this triangle in screen space
// Adding 0xF and shifting right 4 bits is the equivalent of a ceil() function for the 28.4 fixed-point format.
// The first pixel to be filled is the one strictly to the right of the first edge,
// so that's why a ceil() operation is used (the fill convention for colorBuffer spot on the edge is taken care of a few lines lower).
#if 0
INT32 nMinX = Clamp( (Min3(X1, X2, X3) + 0xF) >> 4, 0, W );
INT32 nMaxX = Clamp( (Max3(X1, X2, X3) + 0xF) >> 4, 0, W );
INT32 nMinY = Clamp( (Min3(Y1, Y2, Y3) + 0xF) >> 4, 0, H );
INT32 nMaxY = Clamp( (Max3(Y1, Y2, Y3) + 0xF) >> 4, 0, H );
#else
INT32 nMinX = (Min3(X1, X2, X3) + 0xF) >> 4;
INT32 nMaxX = (Max3(X1, X2, X3) + 0xF) >> 4;
INT32 nMinY = (Min3(Y1, Y2, Y3) + 0xF) >> 4;
INT32 nMaxY = (Max3(Y1, Y2, Y3) + 0xF) >> 4;
#endif
//const INT32 iArea = (imaxx - iminx) * (imaxy - iminy);
//if( iArea == 0 ) {
// return;
//}
// Start in corner of 8x8 block
nMinX &= ~(BLOCK_SIZE_X - 1);
nMinY &= ~(BLOCK_SIZE_Y - 1);
//const UINT stride = W * sizeof SoftPixel; // aka 'pitch'
SoftPixel* startPixels = context.colorBuffer + nMinY * W; // color buffer
ZBufElem* startZBuffer = context.depthBuffer + nMinY * W; // depth buffer
SPixelShaderParameters pixelShaderArgs;
pixelShaderArgs.globals = context.globals;
for( UINT iBlockY = nMinY; iBlockY < nMaxY; iBlockY += BLOCK_SIZE_Y )
{
for( UINT iBlockX = nMinX; iBlockX < nMaxX; iBlockX += BLOCK_SIZE_X )
{
// Corners of block in 28.4 fixed-point (4 bits of sub-pixel accuracy)
const UINT FBlockX0 = (iBlockX << 4);
const UINT FBlockX1 = (iBlockX + (BLOCK_SIZE_X - 1)) << 4;
const UINT FBlockY0 = (iBlockY << 4);
const UINT FBlockY1 = (iBlockY + (BLOCK_SIZE_Y - 1)) << 4;
// Evaluate half-space functions in the 4 corners of the block
//OPTIMIZED: extract sign bit instead of converting to bool
#if 0
const bool a00 = (C1 + DeltaX12 * FBlockY0 - DeltaY12 * FBlockX0) < 0;
const bool a10 = (C1 + DeltaX12 * FBlockY0 - DeltaY12 * FBlockX1) < 0;
const bool a01 = (C1 + DeltaX12 * FBlockY1 - DeltaY12 * FBlockX0) < 0;
const bool a11 = (C1 + DeltaX12 * FBlockY1 - DeltaY12 * FBlockX1) < 0;
const UINT a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); // compose mask for all four corners
const bool b00 = (C2 + DeltaX23 * FBlockY0 - DeltaY23 * FBlockX0) < 0;
const bool b10 = (C2 + DeltaX23 * FBlockY0 - DeltaY23 * FBlockX1) < 0;
const bool b01 = (C2 + DeltaX23 * FBlockY1 - DeltaY23 * FBlockX0) < 0;
const bool b11 = (C2 + DeltaX23 * FBlockY1 - DeltaY23 * FBlockX1) < 0;
const UINT b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); // compose mask for all four corners
const bool c00 = (C3 + DeltaX31 * FBlockY0 - DeltaY31 * FBlockX0) < 0;
const bool c10 = (C3 + DeltaX31 * FBlockY0 - DeltaY31 * FBlockX1) < 0;
const bool c01 = (C3 + DeltaX31 * FBlockY1 - DeltaY31 * FBlockX0) < 0;
const bool c11 = (C3 + DeltaX31 * FBlockY1 - DeltaY31 * FBlockX1) < 0;
const UINT c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); // compose mask for all four corners
#else
const UINT a00 = INT32_SIGN_BIT_SET( C1 + DeltaX12 * FBlockY0 - DeltaY12 * FBlockX0 );
const UINT a10 = INT32_SIGN_BIT_SET( C1 + DeltaX12 * FBlockY0 - DeltaY12 * FBlockX1 );
const UINT a01 = INT32_SIGN_BIT_SET( C1 + DeltaX12 * FBlockY1 - DeltaY12 * FBlockX0 );
const UINT a11 = INT32_SIGN_BIT_SET( C1 + DeltaX12 * FBlockY1 - DeltaY12 * FBlockX1 );
const UINT a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3); // compose 4-bit mask for all four corners
const UINT b00 = INT32_SIGN_BIT_SET( C2 + DeltaX23 * FBlockY0 - DeltaY23 * FBlockX0 );
const UINT b10 = INT32_SIGN_BIT_SET( C2 + DeltaX23 * FBlockY0 - DeltaY23 * FBlockX1 );
const UINT b01 = INT32_SIGN_BIT_SET( C2 + DeltaX23 * FBlockY1 - DeltaY23 * FBlockX0 );
const UINT b11 = INT32_SIGN_BIT_SET( C2 + DeltaX23 * FBlockY1 - DeltaY23 * FBlockX1 );
const UINT b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3); // compose 4-bit mask for all four corners
const UINT c00 = INT32_SIGN_BIT_SET( C3 + DeltaX31 * FBlockY0 - DeltaY31 * FBlockX0 );
const UINT c10 = INT32_SIGN_BIT_SET( C3 + DeltaX31 * FBlockY0 - DeltaY31 * FBlockX1 );
const UINT c01 = INT32_SIGN_BIT_SET( C3 + DeltaX31 * FBlockY1 - DeltaY31 * FBlockX0 );
const UINT c11 = INT32_SIGN_BIT_SET( C3 + DeltaX31 * FBlockY1 - DeltaY31 * FBlockX1 );
const UINT c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3); // compose 4-bit mask for all four corners
#endif
// Skip block when outside an edge => outside the triangle
//OPTIMIZED: replaced logical 'or' (3 jmps) with bitwise ops and one comparison
#if 0
if( a == 0 || b == 0 || c == 0 ) {
continue;
}
#else
const UINT mask = ((~0U << 24) | (a << 16) | (b << 8) | (c));
#define has_nullbyte_(x) ((x - 0x01010101) & ~x & 0x80808080)
if( has_nullbyte_( mask ) )
{
continue;
}
#endif
SoftPixel* pixels = startPixels;
ZBufElem* zbuffer = startZBuffer;
// Accept whole block when totally covered (all four corners are inside triangle and their coverage masks are (1|2|4|8) = 15)
//OPTIMIZED: replaced 'and' with sum
#if 0
if( a == 0xF && b == 0xF && c == 0xF )
#elif 0
if( a + b + c == (0xF + 0xF + 0xF) )
#else
if( mask == 0xFF0F0F0F )
#endif
{
for( UINT iY = iBlockY; iY < iBlockY + BLOCK_SIZE_Y; iY++ )
{
for( UINT iX = iBlockX; iX < iBlockX + BLOCK_SIZE_X; iX++ )
{
const F4 fStartDX = (F4)iX - fX1; //<= conversion int => float is slow!
const F4 fStartDY = (F4)iY - fY1; //<= conversion int => float is slow!
// interpolate depth
const F4 Z = fZ1 + vZ.x * fStartDX + vZ.y * fStartDY;
const ZBufElem thisDepth = FloatDepthToZBufferValue( Z );
const ZBufElem prevDepth = zbuffer[ iX ];
if( thisDepth < prevDepth )
{
// depth test passed
zbuffer[ iX ] = thisDepth;
// compute inverse W
const F4 invW = fInvW1 + vInvW.x * fStartDX + vInvW.y * fStartDY;
// calculate perspectively-correct varyings
for( UINT i = 0; i < NUM_VARYINGS; i++ )
{
const F4 varOverW = vars1[i] + varsOverW[i].x * fStartDX + varsOverW[i].y * fStartDY;
pixelShaderArgs.vars[i] = varOverW / invW; // <= perspective correction via division
}
pixelShaderArgs.depth = Z;
pixelShaderArgs.pixel = pixels + iX;
// execute pixel shader
(*context.pixelShader)( pixelShaderArgs );
}//if depth test passed
}//for x
pixels += W;
zbuffer += W;
}//for y
SoftRenderer::Dbg_BlockRasterizer_DrawFullyCoveredRect( context, FBlockX0 >> 4, FBlockY0 >> 4, BLOCK_SIZE_X, BLOCK_SIZE_Y );
// (end of fully covered block)
}
else // Partially covered block
{
//Assert(!( a == 0xF && b == 0xF && c == 0xF ));
// in 28.4
INT32 CY1 = C1 + DeltaX12 * FBlockY0 - DeltaY12 * FBlockX0;
INT32 CY2 = C2 + DeltaX23 * FBlockY0 - DeltaY23 * FBlockX0;
INT32 CY3 = C3 + DeltaX31 * FBlockY0 - DeltaY31 * FBlockX0;
for( UINT iY = iBlockY; iY < iBlockY + BLOCK_SIZE_X; iY++ )
{
INT32 CX1 = CY1;
INT32 CX2 = CY2;
INT32 CX3 = CY3;
for( UINT iX = iBlockX; iX < iBlockX + BLOCK_SIZE_X; iX++ )
{
// no tiny dots and holes between polygons if we use less or equal
if( CX1 <= 0 && CX2 <= 0 && CX3 <= 0 )
//if( CX1 < 0 && CX2 < 0 && CX3 < 0 )
{
// inside triangle
const F4 fStartDX = (F4)iX - fX1; //<= conversion int => float is slow!
const F4 fStartDY = (F4)iY - fY1; //<= conversion int => float is slow!
// interpolate depth
const F4 Z = fZ1 + vZ.x * fStartDX + vZ.y * fStartDY;
const ZBufElem thisDepth = FloatDepthToZBufferValue( Z );
const ZBufElem prevDepth = zbuffer[ iX ];
if( thisDepth < prevDepth )
{
// depth test passed
zbuffer[ iX ] = thisDepth;
// compute inverse W
const F4 invW = fInvW1 + vInvW.x * fStartDX + vInvW.y * fStartDY;
// calculate perspectively-correct varyings
for( UINT i = 0; i < NUM_VARYINGS; i++ )
{
const F4 varOverW = vars1[i] + varsOverW[i].x * fStartDX + varsOverW[i].y * fStartDY;
pixelShaderArgs.vars[i] = varOverW / invW; // <= perspective correction via division
}
pixelShaderArgs.depth = Z;
pixelShaderArgs.pixel = pixels + iX;
// execute pixel shader
(*context.pixelShader)( pixelShaderArgs );
}
}//if depth test passed
CX1 -= FDY12;
CX2 -= FDY23;
CX3 -= FDY31;
}//for x
CY1 += FDX12;
CY2 += FDX23;
CY3 += FDX31;
pixels += W;
zbuffer += W;
}//for y
SoftRenderer::Dbg_BlockRasterizer_DrawPartiallyCoveredRect( context, FBlockX0 >> 4, FBlockY0 >> 4, BLOCK_SIZE_X, BLOCK_SIZE_Y );
}//Partially covered block
}//for each block on X axis
startPixels += W * BLOCK_SIZE_Y;
startZBuffer += W * BLOCK_SIZE_Y;
}//for each block on Y axis
}