Skip to content

Commit

Permalink
Changes to use shorter and faster FFTs in correlations
Browse files Browse the repository at this point in the history
  • Loading branch information
scottransom committed Feb 3, 2018
1 parent 24ead36 commit e4e5851
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 42 deletions.
2 changes: 2 additions & 0 deletions include/accel.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ typedef struct accelobs{
long long numbins; /* Number of spectral bins in the file */
long long lobin; /* Lowest spectral bin present in the file */
long long highestbin;/* Highest spectral bin present in the file */
int maxkernlen; /* Maximum full width (in points, not Fourier bins) of corr kernels */
int corr_uselen; /* Number of good data points we will get from high-harm correlations */
int fftlen; /* Length of short FFTs to us in search */
int numharmstages; /* Number of stages of harmonic summing */
int numz; /* Number of f-dots searched */
Expand Down
10 changes: 9 additions & 1 deletion include/presto.h
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,15 @@ float percolate_bin(binaryprops * list, int nlist);

/* From prep_corr.c */

void spread_with_pad(fcomplex *data, int numdata, \
int next_good_fftlen(int N);
/* Return one of the shortest, yet best performing, FFT lengths larger
* than N. This assumes FFTW. */

int fftlen_from_kernwidth(int kernwidth);
/* return the length of the optimal FFT to use for correlations with
* some kernel width kernwidth. This assumes FFTW. */

void spread_with_pad(fcomplex *data, int numdata, \
fcomplex *result, int numresult, \
int numbetween, int numpad);
/* Prepare the data array for correlation by spreading */
Expand Down
81 changes: 44 additions & 37 deletions src/accel_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,20 +134,21 @@ static void compare_rzw_cands(fourierprops * list, int nlist, char *notes)
}


static int calc_fftlen(int numharm, int harmnum, int max_zfull, int max_wfull)
static int calc_fftlen(int numharm, int harmnum, int max_zfull, int max_wfull, accelobs * obs)
/* The fft length needed to properly process a subharmonic */
{
int bins_needed, end_effects;
double harm_fract;

harm_fract = (double) harmnum / (double) numharm;
bins_needed = (ACCEL_USELEN * harmnum) / numharm + 2;
bins_needed = (int) ceil(obs->corr_uselen * harm_fract) + 2;
printf("harm_fract = %f harmnum = %d numharm = %d\n", harm_fract, harmnum, numharm);
end_effects = 2 * ACCEL_NUMBETWEEN *
w_resp_halfwidth(calc_required_z(harm_fract, max_zfull),
calc_required_w(harm_fract, max_wfull), LOWACC);
//printf("bins_needed = %d end_effects = %d FFTlen = %lld\n",
// bins_needed, end_effects, next2_to_n(bins_needed + end_effects));
return next2_to_n(bins_needed + end_effects);
w_resp_halfwidth(calc_required_z(harm_fract, max_zfull),
calc_required_w(harm_fract, max_wfull), LOWACC);
printf("bins_needed = %d end_effects = %d FFTlen = %d\n",
bins_needed, end_effects, next_good_fftlen(bins_needed + end_effects));
return next_good_fftlen(bins_needed + end_effects);
}


Expand Down Expand Up @@ -199,7 +200,7 @@ kernel **gen_kernmatrix(int numz, int numw) {
}


static void init_subharminfo(int numharm, int harmnum, int zmax, int wmax, subharminfo * shi)
static void init_subharminfo(int numharm, int harmnum, int zmax, int wmax, subharminfo * shi, accelobs * obs)
/* Note: 'zmax' is the overall maximum 'z' in the search while
'wmax' is the overall maximum 'w' in the search */
{
Expand All @@ -212,10 +213,16 @@ static void init_subharminfo(int numharm, int harmnum, int zmax, int wmax, subha
shi->zmax = calc_required_z(harm_fract, zmax);
shi->wmax = calc_required_w(harm_fract, wmax);
if (numharm > 1) {
shi->rinds = (unsigned short *) malloc(ACCEL_USELEN * sizeof(unsigned short));
shi->zinds = (unsigned short *) malloc(ACCEL_USELEN * sizeof(unsigned short));
shi->rinds = (unsigned short *) malloc(obs->corr_uselen * sizeof(unsigned short));
shi->zinds = (unsigned short *) malloc(obs->corr_uselen * sizeof(unsigned short));
}
fftlen = calc_fftlen(numharm, harmnum, zmax, wmax);
if (numharm==1 && harmnum==1) {
fftlen = obs->fftlen;
printf("for fundamental: bins_needed = %d maxkern = %d fftlen = %d\n",
obs->corr_uselen, obs->maxkernlen, obs->fftlen);
}
else
fftlen = calc_fftlen(numharm, harmnum, zmax, wmax, obs);
shi->numkern_zdim = (shi->zmax / ACCEL_DZ) * 2 + 1;
shi->numkern_wdim = (shi->wmax / ACCEL_DW) * 2 + 1;
shi->numkern = shi->numkern_zdim * shi->numkern_wdim;
Expand All @@ -240,8 +247,8 @@ subharminfo **create_subharminfos(accelobs * obs)
shis = (subharminfo **) malloc(obs->numharmstages * sizeof(subharminfo *));
/* Prep the fundamental (actually, the highest harmonic) */
shis[0] = (subharminfo *) malloc(2 * sizeof(subharminfo));
init_subharminfo(1, 1, (int) obs->zhi, (int) obs->whi, &shis[0][0]);
fftlen = calc_fftlen(1, 1, (int) obs->zhi, (int) obs->whi);
init_subharminfo(1, 1, (int) obs->zhi, (int) obs->whi, &shis[0][0], obs);
fftlen = obs->fftlen;
kern_ram_use += shis[0][0].numkern * fftlen * sizeof(fcomplex); // in Bytes
if (obs->numw)
printf(" Harm 1/1 : %5d kernels, %4d < z < %-4d and %5d < w < %-5d (%5d pt FFTs)\n",
Expand All @@ -257,8 +264,8 @@ subharminfo **create_subharminfos(accelobs * obs)
shis[ii] = (subharminfo *) malloc(harmtosum * sizeof(subharminfo));
for (jj = 1; jj < harmtosum; jj += 2) {
init_subharminfo(harmtosum, jj, (int) obs->zhi,
(int) obs->whi, &shis[ii][jj - 1]);
fftlen = calc_fftlen(harmtosum, jj, (int) obs->zhi, (int) obs->whi);
(int) obs->whi, &shis[ii][jj - 1], obs);
fftlen = calc_fftlen(harmtosum, jj, (int) obs->zhi, (int) obs->whi, obs);
kern_ram_use += shis[ii][jj - 1].numkern * fftlen * sizeof(fcomplex); // in Bytes
if (obs->numw)
printf(" Harm %2d/%-2d: %5d kernels, %4d < z < %-4d and %5d < w < %-5d (%5d pt FFTs)\n",
Expand Down Expand Up @@ -1014,22 +1021,10 @@ ffdotpows *subharm_fderivs_vol(int numharm, int harmnum,
int ii, numdata, fftlen, binoffset;
long long lobin;
float powargr, powargi;
static int numrs_full = 0;
double drlo, drhi, harm_fract;
ffdotpows *ffdot;
fcomplex *data, *pdata;
fftwf_plan invplan;

if (numrs_full == 0) {
if (numharm == 1 && harmnum == 1) {
numrs_full = ACCEL_USELEN;
} else {
printf("You must call subharm_fderivs_vol() with numharm=1 and\n");
printf("harnum=1 before you use other values! Exiting.\n\n");
exit(0);
}
}
ffdot = (ffdotpows *) malloc(sizeof(ffdotpows));
ffdotpows *ffdot = (ffdotpows *) malloc(sizeof(ffdotpows));

/* Calculate and get the required amplitudes */
harm_fract = (double) harmnum / (double) numharm;
Expand All @@ -1042,7 +1037,7 @@ ffdotpows *subharm_fderivs_vol(int numharm, int harmnum,
/* Initialize the lookup indices */
if (numharm > 1 && !obs->inmem) {
double rr, subr;
for (ii = 0; ii < numrs_full; ii++) {
for (ii = 0; ii < obs->corr_uselen; ii++) {
rr = fullrlo + ii * ACCEL_DR;
subr = calc_required_r(harm_fract, rr);
shi->rinds[ii] = index_from_r(subr, ffdot->rlo);
Expand All @@ -1056,14 +1051,16 @@ ffdotpows *subharm_fderivs_vol(int numharm, int harmnum,
}
ffdot->rinds = shi->rinds;
ffdot->numrs = (int) ((ceil(drhi) - floor(drlo))
* ACCEL_RDR + DBLCORRECT) + 1;
* ACCEL_RDR + DBLCORRECT);
if (numharm == 1 && harmnum == 1) {
ffdot->numrs = ACCEL_USELEN;
ffdot->numrs = obs->corr_uselen;
} else {
if (ffdot->numrs % ACCEL_RDR) {
ffdot->numrs = (ffdot->numrs / ACCEL_RDR + 1) * ACCEL_RDR;
}
}
//printf("%d/%d fullrlo = %f fullrhi = %f drlo = %f drhi = %f ffdot->numrs = %d\n",
// harmnum, numharm, fullrlo, fullrhi, drlo, drhi, ffdot->numrs);
ffdot->zinds = shi->zinds;
ffdot->numzs = shi->numkern_zdim;
ffdot->numws = shi->numkern_wdim;
Expand Down Expand Up @@ -1226,7 +1223,7 @@ void fund_to_ffdotplane(ffdotpows * ffd, accelobs * obs)
// This moves the fundamental's ffdot plane powers
// into the one for the full array
int ii;
long long rlen = (obs->highestbin + ACCEL_USELEN) * ACCEL_RDR;
long long rlen = (obs->highestbin + obs->corr_uselen) * ACCEL_RDR;
long long offset;
float *outpow;

Expand Down Expand Up @@ -1334,7 +1331,7 @@ void inmem_add_ffdotpows(ffdotpows * fundamental, accelobs * obs,
#endif
{
const int zlo = fundamental->zlo;
const long long rlen = (obs->highestbin + ACCEL_USELEN) * ACCEL_RDR;
const long long rlen = (obs->highestbin + obs->corr_uselen) * ACCEL_RDR;
float *powptr = fundamental->powers[0][0];
float *fdp = obs->ffdotplane;
int ii, jj, zz, zind, subz;
Expand Down Expand Up @@ -1670,7 +1667,7 @@ void create_accelobs(accelobs * obs, infodata * idata, Cmdline * cmd, int usemma
}

/* Determine the other parameters */

if (cmd->zmax % ACCEL_DZ)
cmd->zmax = (cmd->zmax / ACCEL_DZ + 1) * ACCEL_DZ;
obs->N = (long long) idata->N;
Expand Down Expand Up @@ -1839,19 +1836,29 @@ void create_accelobs(accelobs * obs, infodata * idata, Cmdline * cmd, int usemma
obs->numzap = 0;
*/

/* Determine corr_uselen from zmax and wmax */
obs->maxkernlen = 2 * ACCEL_NUMBETWEEN * \
w_resp_halfwidth(obs->zhi, obs->whi, LOWACC);
obs->fftlen = fftlen_from_kernwidth(obs->maxkernlen);
obs->corr_uselen = obs->fftlen - obs->maxkernlen;
// Make sure that obs->corr_uselen is an integer number of
// full (i.e. un-interpolated) Fourier bins
if (obs->corr_uselen % ACCEL_RDR)
obs->corr_uselen = obs->corr_uselen / ACCEL_RDR * ACCEL_RDR;

/* Can we perform the search in-core memory? */
{
long long memuse;
double gb = (double) (1L << 30);

// This is the size of powers covering the full f-dot-dot plane to search
// Need the extra ACCEL_USELEN since we generate the plane in blocks
// Need the extra obs->corr_uselen since we generate the plane in blocks
if (cmd->wmaxP) {
memuse = sizeof(float) * (obs->highestbin + ACCEL_USELEN)
memuse = sizeof(float) * (obs->highestbin + obs->corr_uselen)
* obs->numbetween * obs->numz * obs->numw;
printf("Full f-dot-dot volume would need %.2f GB: ", (float) memuse / gb);
} else {
memuse = sizeof(float) * (obs->highestbin + ACCEL_USELEN)
memuse = sizeof(float) * (obs->highestbin + obs->corr_uselen)
* obs->numbetween * obs->numz;
printf("Full f-fdot plane would need %.2f GB: ", (float) memuse / gb);
}
Expand Down
11 changes: 7 additions & 4 deletions src/accelsearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ static void print_percent_complete(int current, int number, char *what, int rese

int main(int argc, char *argv[])
{
int ii;
int ii, rstep;
double ttim, utim, stim, tott;
struct tms runtimes;
subharminfo **subharminfs;
Expand Down Expand Up @@ -79,6 +79,9 @@ int main(int argc, char *argv[])
/* Create the accelobs structure */
create_accelobs(&obs, &idata, cmd, 1);

/* The step-size of blocks to walk through the input data */
rstep = obs.corr_uselen * ACCEL_DR;

/* Zap birdies if requested and if in memory */
if (cmd->zaplistP && !obs.mmap_file && obs.fft) {
int numbirds;
Expand Down Expand Up @@ -165,7 +168,7 @@ int main(int argc, char *argv[])
lastr = 0;
nextr = 0;
while (startr < obs.rlo) {
nextr = startr + ACCEL_USELEN * ACCEL_DR;
nextr = startr + rstep;
lastr = nextr - ACCEL_DR;
// Compute the F-Fdot plane
fundamental = subharm_fderivs_vol(1, 1, startr, lastr,
Expand All @@ -181,11 +184,11 @@ int main(int argc, char *argv[])
startr = obs.rlo;
lastr = 0;
nextr = 0;
while (startr + ACCEL_USELEN * ACCEL_DR < obs.highestbin) {
while (startr + rstep < obs.highestbin) {
/* Search the fundamental */
print_percent_complete(startr - obs.rlo,
obs.highestbin - obs.rlo, "search", 0);
nextr = startr + ACCEL_USELEN * ACCEL_DR;
nextr = startr + rstep;
lastr = nextr - ACCEL_DR;
fundamental = subharm_fderivs_vol(1, 1, startr, lastr,
&subharminfs[0][0], &obs);
Expand Down
34 changes: 34 additions & 0 deletions src/corr_prep.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,39 @@
#include "presto.h"

int next_good_fftlen(int N)
/* Return one of the shortest, yet best performing, FFT lengths larger
* than N. This assumes FFTW. */
{
int fftlens[17] = {128, 192, 256, 384, 512, 768, 1024, 1280, 2048, 4096,
5120, 7680, 10240, 12288, 15360, 16384, 25600};
int ii = 0;
if (N <= fftlens[0])
return fftlens[0];
if (N > fftlens[16])
return next2_to_n(N);
while (N > fftlens[ii]) ii++;
return fftlens[ii];
}

int fftlen_from_kernwidth(int kernwidth)
/* return the length of the optimal FFT to use for correlations with
* some kernel width kernwidth. This assumes FFTW. */
{
// The following nummbers were determined using FFTW 3.3.7 on an
// AVX-enabled processor. Metric used was max throughput of good
// correlated data.
if (kernwidth < 6) return 128;
else if (kernwidth < 52) return 256;
else if (kernwidth < 67) return 512;
else if (kernwidth < 378) return 1024;
else if (kernwidth < 664) return 2048;
else if (kernwidth < 1672) return 4096;
else if (kernwidth < 3015) return 10240;
else if (kernwidth < 3554) return 15360;
else if (kernwidth < 6000) return 25600;
else return next2_to_n(kernwidth*5);
}

void spread_with_pad(fcomplex * data, int numdata,
fcomplex * result, int numresult, int numbetween, int numpad)
/* Prepare the data array for correlation by spreading */
Expand Down

0 comments on commit e4e5851

Please sign in to comment.