Skip to content

Commit

Permalink
GPU performance improvements (#488)
Browse files Browse the repository at this point in the history
* cuFINUFFT binsize is now a function of the shared memory available where
  possible.
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
* cuFINUFFT using the new normalized Horner coefficients and added support
  for 1.25.
* cuFINUFFT new compile flags for extra-vectorization, flushing single
  precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
  performance.
* cuFINUFFT using SM90 float2 vector atomicAdd where supported
* cuFINUFFT making default binsize = 0
  • Loading branch information
DiamonDinoia authored Aug 2, 2024
1 parent b81c86f commit b3c2be7
Show file tree
Hide file tree
Showing 38 changed files with 2,220 additions and 1,464 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ V 2.3.0beta (7/24/24)
* cmake adding nvcc and msvc optimization flags
* cmake supports sphinx
* updated install docs
* cuFINUFFT binsize is now a function of the shared memory available where
possible.
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
* cuFINUFFT using the new normalized Horner coefficients and added support
for 1.25.
* cuFINUFFT new compile flags for extra-vectorization, flushing single
precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
  performance.
* cuFINUFFT using SM90 float2 vector atomicAdd where supported
* cuFINUFFT making default binsize = 0

V 2.2.0 (12/12/23)

Expand Down
24 changes: 13 additions & 11 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@ project(finufft_devel)
# Set the minimum required version of CMake
cmake_minimum_required(VERSION 3.5)


# include cpm cmake, downloading it
CPMAddPackage(
NAME benchmark
GITHUB_REPOSITORY google/benchmark
VERSION 1.8.3
OPTIONS "BENCHMARK_ENABLE_TESTING OFF"

)
cpmaddpackage(
NAME
benchmark
GITHUB_REPOSITORY
google/benchmark
VERSION
1.8.3
OPTIONS
"BENCHMARK_ENABLE_TESTING OFF")

if (benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
if(benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark xsimd)
add_executable(padding padding.cpp)
target_compile_features(padding PRIVATE cxx_std_17)
target_link_libraries(padding finufft xsimd)
target_compile_options(padding PRIVATE -march=native)
12 changes: 6 additions & 6 deletions devel/gen_all_horner_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@

for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4)
fprintf('upsampfac = %g...\n',upsampfac)

ws = 2:16;
opts.wpad = true; % pad kernel eval to multiple of 4
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
Expand All @@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' if (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
Expand Down
4 changes: 2 additions & 2 deletions devel/gen_ker_horner_loop_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
width = w;
end
for n=1:d+1 % loop over poly coeff powers
s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
for i=2:width % loop over segments
s = sprintf('%s, %.16E', s, C(n,i));
s = sprintf('%s, %.16E', s, C(n,i));
end
str{n} = [s sprintf('};\n')];
end
Expand Down
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,23 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)

foreach(EXAMPLE ${EXAMPLES})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
enable_asan(${EXAMPLE})
endforeach()

foreach(EXAMPLE ${EXAMPLES_C})
add_executable(${EXAMPLE} ${EXAMPLE}.c)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()

if(FINUFFT_USE_OPENMP)
foreach(EXAMPLE ${EXAMPLES_OPENMP})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()
endif()
2 changes: 1 addition & 1 deletion examples/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

file(GLOB example_src "*.cpp")

foreach(srcfile ${example_src})
Expand All @@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
add_executable(${executable} ${srcfile})
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(${executable} cufinufft)
target_compile_features(${executable} PRIVATE cxx_std_17)
endforeach()
33 changes: 33 additions & 0 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cufft.h>
#include <cufinufft/types.h>
#include <cufinufft_opts.h>
#include <finufft_errors.h>
#include <finufft_spread_opts.h>

#include <complex.h>
Expand Down Expand Up @@ -32,6 +33,38 @@ template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);

template<typename T, typename V>
auto cufinufft_set_shared_memory(V *kernel, const int dim,
                                 const cufinufft_plan_t<T> &d_plan) {
  /**
   * Configure `kernel` so it can be launched with the dynamic shared-memory
   * size the plan's spreader needs.
   *
   * Computes the requirement from the kernel width (spopts.nspread) and the
   * plan's GPU bin sizes via shared_memory_required<T>, compares it against the
   * current device's opt-in per-block limit
   * (cudaDevAttrMaxSharedMemoryPerBlockOptin), and on success raises the
   * kernel's cudaFuncAttributeMaxDynamicSharedMemorySize so launches above the
   * default dynamic shared-memory cap are permitted.
   *
   * Returns 0 on success, or FINUFFT_ERR_INSUFFICIENT_SHMEM (after printing a
   * diagnostic to stderr) when the device cannot supply the required amount.
   *
   * WARNING: this function does not handle cuda errors. The caller should check them.
   */
  int device_id{}, shared_mem_per_block{};
  cudaGetDevice(&device_id); // the attribute query below is per-device
  const auto shared_mem_required =
      shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
                                d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
  cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device_id);
  // NOTE(review): std::size_t vs int comparison — the int is promoted to
  // unsigned. Safe as long as the driver reports a non-negative attribute,
  // which it does for this query; confirm if new attributes are added here.
  if (shared_mem_required > shared_mem_per_block) {
    fprintf(stderr,
            "Error: Shared memory required per block is %zu bytes, but the device "
            "supports only %d bytes.\n",
            shared_mem_required, shared_mem_per_block);
    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
  }
  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_mem_required);
  return 0;
}

} // namespace common
} // namespace cufinufft
#endif
15 changes: 8 additions & 7 deletions include/cufinufft/contrib/helper_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
}

#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
// Check-and-bail macro for use inside functions returning a FINUFFT error
// code: if any prior CUDA API call or kernel launch left a sticky error,
// report it (to stderr, consistent with the rest of the library's error
// output) and return FINUFFT_ERR_CUDA_FAILURE from the enclosing function.
// Note cudaGetLastError() also CLEARS the error state.
//
// Wrapped in do { ... } while (0) (rather than a bare { ... } block) so the
// macro behaves as a single statement: `if (cond) RETURN_IF_CUDA_ERROR; else
// ...` parses correctly instead of the trailing `;` terminating the `if`.
#define RETURN_IF_CUDA_ERROR                                                         \
  do {                                                                               \
    cudaError_t err = cudaGetLastError();                                            \
    if (err != cudaSuccess) {                                                        \
      fprintf(stderr, "[%s] Error: %s in %s at line %d\n", __func__,                 \
              cudaGetErrorString(err), __FILE__, __LINE__);                          \
      return FINUFFT_ERR_CUDA_FAILURE;                                               \
    }                                                                                \
  } while (0)

#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \
Expand Down
Loading

0 comments on commit b3c2be7

Please sign in to comment.