From 12d1760f65dfe8a1b6e45735afc6c330eb5b643a Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Sat, 8 Feb 2025 13:09:04 -0500 Subject: [PATCH] sionly changelog --- CHANGELOG | 7 ++++-- include/cufinufft/impl.h | 7 ------ src/cuda/1d/cufinufft1d.cu | 45 ++++++++++++++++++-------------------- src/cuda/2d/cufinufft2d.cu | 39 +++++++++++++++------------------ src/cuda/3d/cufinufft3d.cu | 45 +++++++++++++++++++------------------- 5 files changed, 66 insertions(+), 77 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index fa30ed54c..b8867a85c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,11 +1,14 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). -Master, using release name V 2.4.0 (1/7/25) +Master, using release name V 2.4.0 (2/8/25) + +* CPU opts.spreadinterponly (experts only), and GPU, logic and docs changed so + upsampfac controls kernel shape properly. Add C++/MATLAB demos. #602 (Barnett) * PR617: Caching pip dependencies in github actions. Forcing Ninja when building python on Windows. * PR614: Added support for sccache in github actions. - Caching cmake dependencies so to avoid downloading fftw, xsimd, etc. every time. + Caching cmake dependencies so to avoid downloading fftw, xsimd, etc every time. * fully removed chkbnds option (opts and spreadopts) (Barnett) * classic GNU makefile settings make.inc.* tidied to make-platforms/ (Barnett) * unified separate-dim arrays (eg X,Y,Z->XYZ), simplifiying core (Reinecke #592) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 20c39934a..f45fd2709 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -155,13 +155,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac); } } - if (d_plan->opts.gpu_spreadinterponly) { - // XNOR implementation below with boolean logic. - if ((d_plan->opts.upsampfac != 1.0) == (type != 3)) { - ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID; - goto finalize; - } - } /* Setup Spreader */ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index d0328236f..d5387bfec 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -42,11 +42,11 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_cstart = d_c + i * d_plan->batchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms; d_plan->c = d_cstart; - d_plan->fk = d_fkstart; + d_plan->fk = d_fkstart; // so deconvolve will write into user output f if (d_plan->opts.gpu_spreadinterponly) - d_plan->fw = d_fkstart; - - // this is needed + d_plan->fw = d_fkstart; // spread directly into user output f + + // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) @@ -54,11 +54,9 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread1d(d_plan, blksize))) return ier; - // if spreadonly, skip the rest - if (d_plan->opts.gpu_spreadinterponly) - continue; - + if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3 + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -103,24 +101,23 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, d_plan->c = d_cstart; d_plan->fk = d_fkstart; - + // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; - } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - } - else - d_plan->fw = d_fkstart; - + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + } else + d_plan->fw = d_fkstart; // interpolate directly from user input f + // Step 3: Interpolate if ((ier = cuinterp1d(d_plan, blksize))) return ier; } diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 916c4c71d..184377234 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -43,9 +43,9 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_cstart = d_c + i * d_plan->batchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; - d_plan->fk = d_fkstart; + d_plan->fk = d_fkstart; // so deconvolve will write into user output f if (d_plan->opts.gpu_spreadinterponly) - d_plan->fw = d_fkstart; + d_plan->fw = d_fkstart; // spread directly into user output f // this is needed if ((ier = checkCudaErrors(cudaMemsetAsync( @@ -57,10 +57,8 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread2d(d_plan, blksize))) return ier; - // if spreadonly, skip the rest - if (d_plan->opts.gpu_spreadinterponly) - continue; - + if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3 + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); @@ -108,21 +106,20 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; - } - - // Step 2: FFT - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - } - else - d_plan->fw = d_fkstart; - + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + } else + d_plan->fw = d_fkstart; // interpolate directly from user input f + // Step 3: Interpolate if ((ier = cuinterp2d(d_plan, blksize))) return ier; } diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 805cdf4bb..663aaa062 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -31,6 +31,7 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, Melody Shih 07/25/19 */ { + assert(d_plan->spopts.spread_direction == 1); auto &stream = d_plan->stream; int ier; cuda_complex *d_fkstart; @@ -41,9 +42,9 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, d_fkstart = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu; d_plan->c = d_cstart; - d_plan->fk = d_fkstart; + d_plan->fk = d_fkstart; // so deconvolve will write into user output f if (d_plan->opts.gpu_spreadinterponly) - d_plan->fw = d_fkstart; + d_plan->fw = d_fkstart; // spread directly into user output f if ((ier = checkCudaErrors(cudaMemsetAsync( d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex), @@ -52,12 +53,10 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, // Step 1: Spread if ((ier = cuspread3d(d_plan, blksize))) return ier; - - // if spreadonly, skip the rest - if (d_plan->opts.gpu_spreadinterponly) - continue; - - // Step 2: FFT + + if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3 + + // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; @@ -89,6 +88,7 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, Melody Shih 07/25/19 */ { + assert(d_plan->spopts.spread_direction == 2); int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; @@ -102,21 +102,20 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, // Skip steps 1 and 2 if interponly if (!d_plan->opts.gpu_spreadinterponly) { - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; - } else { - if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; - } - // Step 2: FFT - RETURN_IF_CUDA_ERROR - cufftResult cufft_status = - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; - } - else - d_plan->fw = d_fkstart; - + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } + // Step 2: FFT + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + } else + d_plan->fw = d_fkstart; // interpolate directly from user input f + // Step 3: Interpolate if ((ier = cuinterp3d(d_plan, blksize))) return ier; }