Skip to content

Commit

Permalink
GPU performance improvements (#488)
Browse files Browse the repository at this point in the history
* cuFINUFFT binsize is now a function of the shared memory available where
  possible.
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
* cuFINUFFT using the new normalized Horner coefficients and added support
  for 1.25.
* cuFINUFFT new compile flags for extra-vectorization, flushing single
  precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
  performance.
* cuFINUFFT using SM90 float2 vector atomicAdd where supported
* cuFINUFFT making default binsize = 0
  • Loading branch information
DiamonDinoia authored Aug 2, 2024
1 parent b81c86f commit b3c2be7
Show file tree
Hide file tree
Showing 38 changed files with 2,220 additions and 1,464 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ V 2.3.0beta (7/24/24)
* cmake adding nvcc and msvc optimization flags
* cmake supports sphinx
* updated install docs
* cuFINUFFT binsize is now a function of the shared memory available where
possible.
* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
* cuFINUFFT using the new normalized Horner coefficients and added support
for 1.25.
* cuFINUFFT new compile flags for extra-vectorization, flushing single
precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
  performance.
* cuFINUFFT using SM90 float2 vector atomicAdd where supported
* cuFINUFFT making default binsize = 0

V 2.2.0 (12/12/23)

Expand Down
24 changes: 13 additions & 11 deletions devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@ project(finufft_devel)
# Set the minimum required version of CMake
cmake_minimum_required(VERSION 3.5)


# include cpm cmake, downloading it
CPMAddPackage(
NAME benchmark
GITHUB_REPOSITORY google/benchmark
VERSION 1.8.3
OPTIONS "BENCHMARK_ENABLE_TESTING OFF"

)
cpmaddpackage(
NAME
benchmark
GITHUB_REPOSITORY
google/benchmark
VERSION
1.8.3
OPTIONS
"BENCHMARK_ENABLE_TESTING OFF")

if (benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
if(benchmark_ADDED)
# patch benchmark target
set_target_properties(benchmark PROPERTIES CXX_STANDARD 17)
endif()

add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark xsimd)
add_executable(padding padding.cpp)
target_compile_features(padding PRIVATE cxx_std_17)
target_link_libraries(padding finufft xsimd)
target_compile_options(padding PRIVATE -march=native)
12 changes: 6 additions & 6 deletions devel/gen_all_horner_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@

for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4)
fprintf('upsampfac = %g...\n',upsampfac)

ws = 2:16;
opts.wpad = true; % pad kernel eval to multiple of 4
opts.wpad = false; % pad kernel eval to multiple of 4

if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
end
fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
Expand All @@ -27,9 +27,9 @@
fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
str = gen_ker_horner_loop_C_code(w,d,beta,opts);
if j==1 % write switch statement
fwrite(fid,sprintf(' if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' if (w==%d) {\n',w));
else
fwrite(fid,sprintf(' } else if constexpr(w==%d) {\n',w));
fwrite(fid,sprintf(' } else if (w==%d) {\n',w));
end
for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
end
Expand Down
4 changes: 2 additions & 2 deletions devel/gen_ker_horner_loop_C_code.m
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
width = w;
end
for n=1:d+1 % loop over poly coeff powers
s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
for i=2:width % loop over segments
s = sprintf('%s, %.16E', s, C(n,i));
s = sprintf('%s, %.16E', s, C(n,i));
end
str{n} = [s sprintf('};\n')];
end
Expand Down
3 changes: 3 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,23 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf)

foreach(EXAMPLE ${EXAMPLES})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
enable_asan(${EXAMPLE})
endforeach()

foreach(EXAMPLE ${EXAMPLES_C})
add_executable(${EXAMPLE} ${EXAMPLE}.c)
target_link_libraries(${EXAMPLE} PRIVATE finufft)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()

if(FINUFFT_USE_OPENMP)
foreach(EXAMPLE ${EXAMPLES_OPENMP})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX)
target_compile_features(${EXAMPLE} PRIVATE cxx_std_17)
enable_asan(${EXAMPLE})
endforeach()
endif()
2 changes: 1 addition & 1 deletion examples/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

file(GLOB example_src "*.cpp")

foreach(srcfile ${example_src})
Expand All @@ -7,4 +6,5 @@ foreach(srcfile ${example_src})
add_executable(${executable} ${srcfile})
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(${executable} cufinufft)
target_compile_features(${executable} PRIVATE cxx_std_17)
endforeach()
33 changes: 33 additions & 0 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cufft.h>
#include <cufinufft/types.h>
#include <cufinufft_opts.h>
#include <finufft_errors.h>
#include <finufft_spread_opts.h>

#include <complex.h>
Expand Down Expand Up @@ -32,6 +33,38 @@ template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);

template<typename T, typename V>
auto cufinufft_set_shared_memory(V *kernel, const int dim,
                                 const cufinufft_plan_t<T> &d_plan) {
  /**
   * Configure `kernel` so it can be launched with the dynamic shared-memory
   * size the plan's spreader needs.
   *
   * Computes the requirement from the kernel width (spopts.nspread) and the
   * plan's GPU bin sizes via shared_memory_required<T>, compares it against the
   * current device's opt-in per-block limit
   * (cudaDevAttrMaxSharedMemoryPerBlockOptin), and on success raises the
   * kernel's cudaFuncAttributeMaxDynamicSharedMemorySize so launches above the
   * default dynamic shared-memory cap are permitted.
   *
   * Returns 0 on success, or FINUFFT_ERR_INSUFFICIENT_SHMEM (after printing a
   * diagnostic to stderr) when the device cannot supply the required amount.
   *
   * WARNING: this function does not handle cuda errors. The caller should check them.
   */
  int device_id{}, shared_mem_per_block{};
  cudaGetDevice(&device_id); // the attribute query below is per-device
  const auto shared_mem_required =
      shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
                                d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
  cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device_id);
  // NOTE(review): std::size_t vs int comparison — the int is promoted to
  // unsigned. Safe as long as the driver reports a non-negative attribute,
  // which it does for this query; confirm if new attributes are added here.
  if (shared_mem_required > shared_mem_per_block) {
    fprintf(stderr,
            "Error: Shared memory required per block is %zu bytes, but the device "
            "supports only %d bytes.\n",
            shared_mem_required, shared_mem_per_block);
    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
  }
  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_mem_required);
  return 0;
}

} // namespace common
} // namespace cufinufft
#endif
15 changes: 8 additions & 7 deletions include/cufinufft/contrib/helper_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
}

#define RETURN_IF_CUDA_ERROR \
{ \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
return FINUFFT_ERR_CUDA_FAILURE; \
} \
// Check-and-bail macro for use inside functions returning a FINUFFT error
// code: if any prior CUDA API call or kernel launch left a sticky error,
// report it (to stderr, consistent with the rest of the library's error
// output) and return FINUFFT_ERR_CUDA_FAILURE from the enclosing function.
// Note cudaGetLastError() also CLEARS the error state.
//
// Wrapped in do { ... } while (0) (rather than a bare { ... } block) so the
// macro behaves as a single statement: `if (cond) RETURN_IF_CUDA_ERROR; else
// ...` parses correctly instead of the trailing `;` terminating the `if`.
#define RETURN_IF_CUDA_ERROR                                                         \
  do {                                                                               \
    cudaError_t err = cudaGetLastError();                                            \
    if (err != cudaSuccess) {                                                        \
      fprintf(stderr, "[%s] Error: %s in %s at line %d\n", __func__,                 \
              cudaGetErrorString(err), __FILE__, __LINE__);                          \
      return FINUFFT_ERR_CUDA_FAILURE;                                               \
    }                                                                                \
  } while (0)

#define CUDA_FREE_AND_NULL(val, stream, pool_supported) \
Expand Down
Loading

0 comments on commit b3c2be7

Please sign in to comment.