-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathPatchFinder.cc
553 lines (486 loc) · 20.9 KB
/
PatchFinder.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
// Copyright 2008 Isis Innovation Limited
#include "PatchFinder.h"
#include "SmallMatrixOpts.h"
#include "KeyFrame.h"
#include <cvd/vision.h>
#include <cvd/vector_image_ref.h>
#include <cvd/image_interpolate.h>
#include <TooN/Cholesky.h>
// tmmintrin.h contains SSE3<> instrinsics, used for the ZMSSD search at the bottom..
// If this causes problems, just do #define CVD_HAVE_XMMINTRIN 0
#if CVD_HAVE_XMMINTRIN
#include <tmmintrin.h>
#endif
using namespace CVD;
using namespace std;
PatchFinder::PatchFinder(int nPatchSize)
: mimTemplate(ImageRef(nPatchSize,nPatchSize))
{
mnPatchSize = nPatchSize;
mirCenter = ImageRef(nPatchSize/2, nPatchSize/2);
int nMaxSSDPerPixel = 500; // Pretty arbitrary... could make a GVar out of this.
mnMaxSSD = mnPatchSize * mnPatchSize * nMaxSSDPerPixel;
// Populate the speed-up caches with bogus values:
mm2LastWarpMatrix = 9999.9 * Identity;
mpLastTemplateMapPoint = NULL;
};
// Find the warping matrix and search level
int PatchFinder::CalcSearchLevelAndWarpMatrix(MapPoint &p,
SE3<> se3CFromW,
Matrix<2> &m2CamDerivs)
{
// Calc point pos in new view camera frame
// Slightly dumb that we re-calculate this here when the tracker's already done this!
Vector<3> v3Cam = se3CFromW * p.v3WorldPos;
double dOneOverCameraZ = 1.0 / v3Cam[2];
// Project the source keyframe's one-pixel-right and one-pixel-down vectors into the current view
Vector<3> v3MotionRight = se3CFromW.get_rotation() * p.v3PixelRight_W;
Vector<3> v3MotionDown = se3CFromW.get_rotation() * p.v3PixelDown_W;
// Calculate in-image derivatives of source image pixel motions:
mm2WarpInverse.T()[0] = m2CamDerivs * (v3MotionRight.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionRight[2] * dOneOverCameraZ) * dOneOverCameraZ;
mm2WarpInverse.T()[1] = m2CamDerivs * (v3MotionDown.slice<0,2>() - v3Cam.slice<0,2>() * v3MotionDown[2] * dOneOverCameraZ) * dOneOverCameraZ;
double dDet = mm2WarpInverse[0][0] * mm2WarpInverse[1][1] - mm2WarpInverse[0][1] * mm2WarpInverse[1][0];
mnSearchLevel = 0;
// This warp matrix is likely not appropriate for finding at level zero, which is
// the level at which it has been calculated. Vary the search level until the
// at that level would be appropriate (does not actually modify the matrix.)
while(dDet > 3 && mnSearchLevel < LEVELS-1)
{
mnSearchLevel++;
dDet *= 0.25;
};
// Some warps are inappropriate, e.g. too near the camera, too far, or reflected,
// or zero area.. reject these!
if(dDet > 3 || dDet < 0.25)
{
mbTemplateBad = true;
return -1;
}
else
return mnSearchLevel;
}
// This is just a convenience function wich caluclates the warp matrix and generates
// the template all in one call.
void PatchFinder::MakeTemplateCoarse(MapPoint &p,
SE3<> se3CFromW,
Matrix<2> &m2CamDerivs)
{
CalcSearchLevelAndWarpMatrix(p, se3CFromW, m2CamDerivs);
MakeTemplateCoarseCont(p);
};
// This function generates the warped search template.
void PatchFinder::MakeTemplateCoarseCont(MapPoint &p)
{
// Get the warping matrix appropriate for use with CVD::transform...
Matrix<2> m2 = M2Inverse(mm2WarpInverse) * LevelScale(mnSearchLevel);
// m2 now represents the number of pixels in the source image for one
// pixel of template image
// Optimisation: Don't re-gen the coarse template if it's going to be substantially the
// same as was made last time. This saves time when the camera is not moving. For this,
// check that (a) this patchfinder is still working on the same map point and (b) the
// warping matrix has not changed much.
bool bNeedToRefreshTemplate = false;
if(&p != mpLastTemplateMapPoint)
bNeedToRefreshTemplate = true;
// Still the same map point? Then compare warping matrix..
for(int i=0; !bNeedToRefreshTemplate && i<2; i++)
{
Vector<2> v2Diff = m2.T()[i] - mm2LastWarpMatrix.T()[i];
const double dRefreshLimit = 0.07; // Sort of works out as half a pixel displacement in src img
if(v2Diff * v2Diff > dRefreshLimit * dRefreshLimit)
bNeedToRefreshTemplate = true;
}
// Need to regen template? Then go ahead.
if(bNeedToRefreshTemplate)
{
int nOutside; // Use CVD::transform to warp the patch according the the warping matrix m2
// This returns the number of pixels outside the source image hit, which should be zero.
nOutside = CVD::transform(p.pPatchSourceKF->aLevels[p.nSourceLevel].im,
mimTemplate,
m2,
vec(p.irCenter),
vec(mirCenter));
if(nOutside)
mbTemplateBad = true;
else
mbTemplateBad = false;
MakeTemplateSums();
// Store the parameters which allow us to determine if we need to re-calculate
// the patch next time round.
mpLastTemplateMapPoint = &p;
mm2LastWarpMatrix = m2;
}
};
// This makes a template without warping. Used for epipolar search, where we don't really know
// what the warping matrix should be. (Although to be fair, I should do rotation for epipolar,
// which we could approximate without knowing patch depth!)
void PatchFinder::MakeTemplateCoarseNoWarp(KeyFrame &k, int nLevel, ImageRef irLevelPos)
{
mnSearchLevel = nLevel;
Image<byte> &im = k.aLevels[nLevel].im;
if(!im.in_image_with_border(irLevelPos, mnPatchSize / 2 + 1))
{
mbTemplateBad = true;
return;
}
mbTemplateBad = false;
copy(im,
mimTemplate,
mimTemplate.size(),
irLevelPos - mirCenter);
MakeTemplateSums();
}
// Convenient wrapper for the above
void PatchFinder::MakeTemplateCoarseNoWarp(MapPoint &p)
{
MakeTemplateCoarseNoWarp(*p.pPatchSourceKF, p.nSourceLevel, p.irCenter);
};
// Finds the sum, and sum-squared, of template pixels. These sums are used
// to calculate the ZMSSD.
inline void PatchFinder::MakeTemplateSums()
{
int nSum = 0;
int nSumSq = 0;
ImageRef ir;
do
{
int b = mimTemplate[ir];
nSum += b;
nSumSq +=b * b;
}
while(ir.next(mimTemplate.size()));
mnTemplateSum = nSum;
mnTemplateSumSq = nSumSq;
}
// One of the main functions of the class! Looks at the appropriate level of
// the target keyframe to try and find the template. Looks only at FAST corner points
// which are within radius nRange of the center. (Params are supplied in Level0
// coords.) Returns true on patch found.
bool PatchFinder::FindPatchCoarse(ImageRef irPos, KeyFrame &kf, unsigned int nRange)
{
mbFound = false;
// Convert from L0 coords to search level quantities
int nLevelScale = LevelScale(mnSearchLevel);
mirPredictedPos = irPos;
irPos = irPos / nLevelScale;
nRange = (nRange + nLevelScale - 1) / nLevelScale;
// Bounding box of search circle
int nTop = irPos.y - nRange;
int nBottomPlusOne = irPos.y + nRange + 1;
int nLeft = irPos.x - nRange;
int nRight = irPos.x + nRange;
// Ref variable for the search level
Level &L = kf.aLevels[mnSearchLevel];
// Some bounds checks on the bounding box..
if(nTop < 0)
nTop = 0;
if(nTop >= L.im.size().y)
return false;
if(nBottomPlusOne <= 0)
return false;
// The next section finds all the FAST corners in the target level which
// are near enough the search center. It's a bit optimised to use
// a corner row look-up-table, since otherwise the routine
// would spend a long time trawling throught the whole list of FAST corners!
vector<ImageRef>::iterator i;
vector<ImageRef>::iterator i_end;
i = L.vCorners.begin() + L.vCornerRowLUT[nTop];
if(nBottomPlusOne >= L.im.size().y)
i_end = L.vCorners.end();
else
i_end = L.vCorners.begin() + L.vCornerRowLUT[nBottomPlusOne];
ImageRef irBest; // Best match so far
int nBestSSD = mnMaxSSD + 1; // Best score so far is beyond the max allowed
for(; i<i_end; i++) // For each corner ...
{
if( i->x < nLeft || i->x > nRight)
continue;
if((irPos - *i).mag_squared() > nRange * nRange)
continue; // ... reject all those not close enough..
int nSSD; // .. and find the ZMSSD at those near enough.
nSSD = ZMSSDAtPoint(L.im, *i);
if(nSSD < nBestSSD) // Best yet?
{
irBest = *i;
nBestSSD = nSSD;
}
} // done looping over corners
if(nBestSSD < mnMaxSSD) // Found a valid match?
{
mv2CoarsePos= LevelZeroPos(irBest, mnSearchLevel);
mbFound = true;
}
else
mbFound = false;
return mbFound;
}
// Makes an inverse composition template out of the coarse template.
// Includes calculating image of derivatives (gradients.) The inverse composition
// used here operates on three variables: x offet, y offset, and difference in patch
// means; hence things like mm3HInv are dim 3, but the trivial mean jacobian
// (always unity, for each pixel) is not stored.
void PatchFinder::MakeSubPixTemplate()
{
mimJacs.resize(mimTemplate.size() - ImageRef(2,2));
Matrix<3> m3H = Zeros; // This stores jTj.
ImageRef ir;
for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
{
Vector<2> v2Grad;
v2Grad[0] = 0.5 * (mimTemplate[ir + ImageRef(1,0)] - mimTemplate[ir - ImageRef(1,0)]);
v2Grad[1] = 0.5 * (mimTemplate[ir + ImageRef(0,1)] - mimTemplate[ir - ImageRef(0,1)]);
mimJacs[ir-ImageRef(1,1)].first = v2Grad[0];
mimJacs[ir-ImageRef(1,1)].second = v2Grad[1];
Vector<3> v3Grad = unproject(v2Grad); // This adds the mean-difference jacobian..
m3H += v3Grad.as_col() * v3Grad.as_row(); // Populate JTJ.
}
// Invert JTJ..
Cholesky<3> chol(m3H);
mm3HInv = chol.get_inverse();
// TOON2 Does not have a get_rank for cholesky
// int nRank = chol.get_rank();
// if(nRank < 3)
// cout << "BAD RANK IN MAKESUBPIXELTEMPLATE!!!!" << endl; // This does not happen often (almost never!)
mv2SubPixPos = mv2CoarsePos; // Start the sub-pixel search at the result of the coarse search..
mdMeanDiff = 0.0;
}
// Iterate inverse composition until convergence. Since it should never have
// to travel more than a pixel's distance, set a max number of iterations;
// if this is exceeded, consider the IC to have failed.
bool PatchFinder::IterateSubPixToConvergence(KeyFrame &kf, int nMaxIts)
{
const double dConvLimit = 0.03;
bool bConverged = false;
int nIts;
for(nIts = 0; nIts < nMaxIts && !bConverged; nIts++)
{
double dUpdateSquared = IterateSubPix(kf);
if(dUpdateSquared < 0) // went off edge of image
return false;
if(dUpdateSquared < dConvLimit*dConvLimit)
return true;
}
return false;
}
// Single iteration of inverse composition. This compares integral image positions in the
// template image to floating point positions in the target keyframe. Interpolation is
// bilinear, and performed manually (rather than using CVD::image_interpolate) since
// this is a special case where the mixing fractions for each pixel are identical.
double PatchFinder::IterateSubPix(KeyFrame &kf)
{
// Search level pos of patch center
Vector<2> v2Center = LevelNPos(mv2SubPixPos, mnSearchLevel);
BasicImage<byte> &im = kf.aLevels[mnSearchLevel].im;
if(!im.in_image_with_border(ir_rounded(v2Center), mnPatchSize / 2 + 1))
return -1.0; // Negative return value indicates off edge of image
// Position of top-left corner of patch in search level
Vector<2> v2Base = v2Center - vec(mirCenter);
// I.C. JT*d accumulator
Vector<3> v3Accum = Zeros;
ImageRef ir;
byte* pTopLeftPixel;
// Each template pixel will be compared to an interpolated target pixel
// The target value is made using bilinear interpolation as the weighted sum
// of four target image pixels. Calculate mixing fractions:
double dX = v2Base[0]-floor(v2Base[0]); // Distances from pixel center of TL pixel
double dY = v2Base[1]-floor(v2Base[1]);
float fMixTL = (1.0 - dX) * (1.0 - dY);
float fMixTR = (dX) * (1.0 - dY);
float fMixBL = (1.0 - dX) * (dY);
float fMixBR = (dX) * (dY);
// Loop over template image
unsigned long nRowOffset = &kf.aLevels[mnSearchLevel].im[ImageRef(0,1)] - &kf.aLevels[mnSearchLevel].im[ImageRef(0,0)];
for(ir.y = 1; ir.y < mnPatchSize - 1; ir.y++)
{
pTopLeftPixel = &im[::ir(v2Base) + ImageRef(1,ir.y)]; // n.b. the x=1 offset, as with y
for(ir.x = 1; ir.x < mnPatchSize - 1; ir.x++)
{
float fPixel = // Calc target interpolated pixel
fMixTL * pTopLeftPixel[0] + fMixTR * pTopLeftPixel[1] +
fMixBL * pTopLeftPixel[nRowOffset] + fMixBR * pTopLeftPixel[nRowOffset + 1];
pTopLeftPixel++;
double dDiff = fPixel - mimTemplate[ir] + mdMeanDiff;
v3Accum[0] += dDiff * mimJacs[ir - ImageRef(1,1)].first;
v3Accum[1] += dDiff * mimJacs[ir - ImageRef(1,1)].second;
v3Accum[2] += dDiff; // Update JT*d
};
}
// All done looping over image - find JTJ^-1 * JTd:
Vector<3> v3Update = mm3HInv * v3Accum;
mv2SubPixPos -= v3Update.slice<0,2>() * LevelScale(mnSearchLevel);
mdMeanDiff -= v3Update[2];
double dPixelUpdateSquared = v3Update.slice<0,2>() * v3Update.slice<0,2>();
return dPixelUpdateSquared;
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
//
//
// ZMSSDatpoint, which is SSE optimised, follows
//
// The top version is the SSE version for 8x8 patches. It is compiled
// only if CVD_HAVE_XMMINTRIN is true, also you need to give your
// compiler the appropriate flags (e.g. -march=core2 -msse3 for g++.)
// The standard c++ version, which is about half as quick (not a disaster
// by any means) is below.
//
// The 8x8 SSE version looks long because it has been unrolled,
// it just does the same thing eight times. Both versions are one-pass
// and need pre-calculated template sums and sum-squares.
//
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
#if CVD_HAVE_XMMINTRIN
// Horizontal sum of uint16s stored in an XMM register
inline int SumXMM_16(__m128i &target)
{
unsigned short int sums_store[8];
_mm_storeu_si128((__m128i*)sums_store, target);
return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3] +
sums_store[4] + sums_store[5] + sums_store[6] + sums_store[7];
}
// Horizontal sum of uint32s stored in an XMM register
inline int SumXMM_32(__m128i &target)
{
unsigned int sums_store[4];
_mm_storeu_si128((__m128i*)sums_store, target);
return sums_store[0] + sums_store[1] + sums_store[2] + sums_store[3];
}
#endif
// Calculate the Zero-mean SSD of the coarse patch and a target imate at a specific
// point.
int PatchFinder::ZMSSDAtPoint(CVD::BasicImage<CVD::byte> &im, const CVD::ImageRef &ir)
{
if(!im.in_image_with_border(ir, mirCenter[0]))
return mnMaxSSD + 1;
ImageRef irImgBase = ir - mirCenter;
byte *imagepointer;
byte *templatepointer;
int nImageSumSq = 0;
int nImageSum = 0;
int nCrossSum = 0;
#if CVD_HAVE_XMMINTRIN
if(mnPatchSize == 8)
{
long unsigned int imagepointerincrement;
__m128i xImageAsEightBytes;
__m128i xImageAsWords;
__m128i xTemplateAsEightBytes;
__m128i xTemplateAsWords;
__m128i xZero;
__m128i xImageSums; // These sums are 8xuint16
__m128i xImageSqSums; // These sums are 4xint32
__m128i xCrossSums; // These sums are 4xint32
__m128i xProduct;
xImageSums = _mm_setzero_si128();
xImageSqSums = _mm_setzero_si128();
xCrossSums = _mm_setzero_si128();
xZero = _mm_setzero_si128();
imagepointer = &im[irImgBase + ImageRef(0,0)];
templatepointer = &mimTemplate[ImageRef(0,0)];
imagepointerincrement = &im[irImgBase + ImageRef(0,1)] - imagepointer;
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
templatepointer += 16;
xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
templatepointer += 16;
xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
templatepointer += 16;
xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
imagepointer += imagepointerincrement;
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsEightBytes=_mm_load_si128((__m128i*) templatepointer);
templatepointer += 16;
xTemplateAsWords = _mm_unpacklo_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
xImageAsEightBytes=_mm_loadl_epi64((__m128i*) imagepointer);
xImageAsWords = _mm_unpacklo_epi8(xImageAsEightBytes,xZero);
xImageSums = _mm_adds_epu16(xImageAsWords,xImageSums);
xProduct = _mm_madd_epi16(xImageAsWords, xImageAsWords);
xImageSqSums = _mm_add_epi32(xProduct, xImageSqSums);
xTemplateAsWords = _mm_unpackhi_epi8(xTemplateAsEightBytes,xZero);
xProduct = _mm_madd_epi16(xImageAsWords, xTemplateAsWords);
xCrossSums = _mm_add_epi32(xProduct, xCrossSums);
nImageSum = SumXMM_16(xImageSums);
nCrossSum = SumXMM_32(xCrossSums);
nImageSumSq = SumXMM_32(xImageSqSums);
}
else
#endif
{
for(int nRow = 0; nRow < mnPatchSize; nRow++)
{
imagepointer = &im[irImgBase + ImageRef(0,nRow)];
templatepointer = &mimTemplate[ImageRef(0,nRow)];
for(int nCol = 0; nCol < mnPatchSize; nCol++)
{
int n = imagepointer[nCol];
nImageSum += n;
nImageSumSq += n*n;
nCrossSum += n * templatepointer[nCol];
};
}
};
int SA = mnTemplateSum;
int SB = nImageSum;
int N = mnPatchSize * mnPatchSize;
return ((2*SA*SB - SA*SA - SB*SB)/N + nImageSumSq + mnTemplateSumSq - 2*nCrossSum);
}