From 12d1760f65dfe8a1b6e45735afc6c330eb5b643a Mon Sep 17 00:00:00 2001
From: ahbarnett <alex.h.barnett@gmail.com>
Date: Sat, 8 Feb 2025 13:09:04 -0500
Subject: [PATCH] sionly changelog

---
 CHANGELOG                  |  7 ++++--
 include/cufinufft/impl.h   |  7 ------
 src/cuda/1d/cufinufft1d.cu | 45 ++++++++++++++++++--------------------
 src/cuda/2d/cufinufft2d.cu | 39 +++++++++++++++------------------
 src/cuda/3d/cufinufft3d.cu | 45 +++++++++++++++++++-------------------
 5 files changed, 66 insertions(+), 77 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index fa30ed54c..b8867a85c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,11 +1,14 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-Master, using release name V 2.4.0 (1/7/25)
+Master, using release name V 2.4.0 (2/8/25)
+
+* CPU opts.spreadinterponly (experts only), and GPU, logic and docs changed so
+  upsampfac controls kernel shape properly. Add C++/MATLAB demos. #602 (Barnett)
 * PR617: Caching pip dependencies in github actions.
   Forcing Ninja when building python on Windows.
 * PR614: Added support for sccache in github actions.
-  Caching cmake dependencies so to avoid downloading fftw, xsimd, etc. every time.
+  Caching cmake dependencies so to avoid downloading fftw, xsimd, etc every time.
 * fully removed chkbnds option (opts and spreadopts) (Barnett)
 * classic GNU makefile settings make.inc.* tidied to make-platforms/ (Barnett)
 * unified separate-dim arrays (eg X,Y,Z->XYZ), simplifiying core (Reinecke #592)
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 20c39934a..f45fd2709 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -155,13 +155,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
     }
   }
-  if (d_plan->opts.gpu_spreadinterponly) {
-    // XNOR implementation below with boolean logic.
-    if ((d_plan->opts.upsampfac != 1.0) == (type != 3)) {
-      ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
-      goto finalize;
-    }
-  }
   /* Setup Spreader */
   if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
     // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
index d0328236f..d5387bfec 100644
--- a/src/cuda/1d/cufinufft1d.cu
+++ b/src/cuda/1d/cufinufft1d.cu
@@ -42,11 +42,11 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms;
     d_plan->c   = d_cstart;
-    d_plan->fk  = d_fkstart;
+    d_plan->fk  = d_fkstart;  // so deconvolve will write into user output f
     if (d_plan->opts.gpu_spreadinterponly)
-        d_plan->fw = d_fkstart;
-    
-        // this is needed
+      d_plan->fw = d_fkstart; // spread directly into user output f
+
+                              // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
              stream))))
@@ -54,11 +54,9 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Step 1: Spread
     if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
-    // if spreadonly, skip the rest
 
-    if (d_plan->opts.gpu_spreadinterponly)
-        continue;
-    
+    if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3
+
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -103,24 +101,23 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     d_plan->c  = d_cstart;
     d_plan->fk = d_fkstart;
-    
+
     // Skip steps 1 and 2 if interponly
     if (!d_plan->opts.gpu_spreadinterponly) {
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if (d_plan->opts.modeord == 0) {
-          if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
-        } else {
-          if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
-        }
-
-        // Step 2: FFT
-        cufftResult cufft_status =
-            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-    }
-    else
-        d_plan->fw = d_fkstart;
-        
+      // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+      if (d_plan->opts.modeord == 0) {
+        if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) return ier;
+      } else {
+        if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) return ier;
+      }
+
+      // Step 2: FFT
+      cufftResult cufft_status =
+          cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+      if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+    } else
+      d_plan->fw = d_fkstart; // interpolate directly from user input f
+
     // Step 3: Interpolate
     if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
   }
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
index 916c4c71d..184377234 100644
--- a/src/cuda/2d/cufinufft2d.cu
+++ b/src/cuda/2d/cufinufft2d.cu
@@ -43,9 +43,9 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_cstart    = d_c + i * d_plan->batchsize * d_plan->M;
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt;
     d_plan->c   = d_cstart;
-    d_plan->fk  = d_fkstart;
+    d_plan->fk  = d_fkstart;  // so deconvolve will write into user output f
     if (d_plan->opts.gpu_spreadinterponly)
-        d_plan->fw = d_fkstart;
+      d_plan->fw = d_fkstart; // spread directly into user output f
 
     // this is needed
     if ((ier = checkCudaErrors(cudaMemsetAsync(
@@ -57,10 +57,8 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     // Step 1: Spread
     if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
 
-    // if spreadonly, skip the rest
-    if (d_plan->opts.gpu_spreadinterponly)
-        continue;
-    
+    if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3
+
     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
@@ -108,21 +106,20 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Skip steps 1 and 2 if interponly
     if (!d_plan->opts.gpu_spreadinterponly) {
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if (d_plan->opts.modeord == 0) {
-          if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
-        } else {
-          if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
-        }
-
-        // Step 2: FFT
-        cufftResult cufft_status =
-            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-    }
-    else
-        d_plan->fw = d_fkstart;
-    
+      // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+      if (d_plan->opts.modeord == 0) {
+        if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) return ier;
+      } else {
+        if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) return ier;
+      }
+
+      // Step 2: FFT
+      cufftResult cufft_status =
+          cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+      if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+    } else
+      d_plan->fw = d_fkstart; // interpolate directly from user input f
+
     // Step 3: Interpolate
     if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
   }
diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
index 805cdf4bb..663aaa062 100644
--- a/src/cuda/3d/cufinufft3d.cu
+++ b/src/cuda/3d/cufinufft3d.cu
@@ -31,6 +31,7 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     Melody Shih 07/25/19
 */
 {
+  assert(d_plan->spopts.spread_direction == 1);
   auto &stream = d_plan->stream;
   int ier;
   cuda_complex<T> *d_fkstart;
@@ -41,9 +42,9 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     d_fkstart   = d_fk + i * d_plan->batchsize * d_plan->ms * d_plan->mt * d_plan->mu;
 
     d_plan->c  = d_cstart;
-    d_plan->fk = d_fkstart;
+    d_plan->fk = d_fkstart;   // so deconvolve will write into user output f
     if (d_plan->opts.gpu_spreadinterponly)
-        d_plan->fw = d_fkstart;
+      d_plan->fw = d_fkstart; // spread directly into user output f
 
     if ((ier = checkCudaErrors(cudaMemsetAsync(
              d_plan->fw, 0, d_plan->batchsize * d_plan->nf * sizeof(cuda_complex<T>),
@@ -52,12 +53,10 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Step 1: Spread
     if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
-    
-    // if spreadonly, skip the rest
-    if (d_plan->opts.gpu_spreadinterponly)
-        continue;
-    
-        // Step 2: FFT
+
+    if (d_plan->opts.gpu_spreadinterponly) continue; // skip steps 2 and 3
+
+                                                     // Step 2: FFT
     cufftResult cufft_status =
         cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
     if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
@@ -89,6 +88,7 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
     Melody Shih 07/25/19
 */
 {
+  assert(d_plan->spopts.spread_direction == 2);
   int ier;
   cuda_complex<T> *d_fkstart;
   cuda_complex<T> *d_cstart;
@@ -102,21 +102,20 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
 
     // Skip steps 1 and 2 if interponly
     if (!d_plan->opts.gpu_spreadinterponly) {
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if (d_plan->opts.modeord == 0) {
-          if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
-        } else {
-          if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
-        }
-        // Step 2: FFT
-        RETURN_IF_CUDA_ERROR
-        cufftResult cufft_status =
-            cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
-    }
-    else
-        d_plan->fw = d_fkstart;
-        
+      // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+      if (d_plan->opts.modeord == 0) {
+        if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) return ier;
+      } else {
+        if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) return ier;
+      }
+      // Step 2: FFT
+      RETURN_IF_CUDA_ERROR
+      cufftResult cufft_status =
+          cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+      if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+    } else
+      d_plan->fw = d_fkstart; // interpolate directly from user input f
+
     // Step 3: Interpolate
     if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
   }