diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml index b0ab6338b..da12ecbca 100644 --- a/.github/workflows/cmake_ci.yml +++ b/.github/workflows/cmake_ci.yml @@ -55,7 +55,7 @@ jobs: id: cache uses: actions/cache@v4 with: - key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }} + key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }} enableCrossOsArchive: true path: cpm - name: Setup Cpp @@ -82,7 +82,7 @@ jobs: - name: Cache dependencies uses: actions/cache/save@v4 with: - key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }} + key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }} enableCrossOsArchive: true path: cpm cmake-ci: @@ -97,18 +97,16 @@ jobs: - name: Restore Cache uses: actions/cache/restore@v4 with: - key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }} + key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }} enableCrossOsArchive: true path: cpm - name: Download requirements.txt uses: actions/download-artifact@v4 with: name: requirements - - name: Run sccache-cache only on non-release runs - if: github.event_name != 'release' && github.event_name != 'workflow_dispatch' + - name: Run sccache-cache uses: mozilla-actions/sccache-action@v0.0.7 - - name: Set caching env vars only on non-release runs - if: github.event_name != 'release' && github.event_name != 'workflow_dispatch' + - name: Set caching env vars run: | echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV echo "SCCACHE_GHA_VERSION=0" >> $GITHUB_ENV diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d21cfc3f..f64bd7812 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,7 @@ if(FINUFFT_USE_CPU) endif() set(FFTW_VERSION 3.3.10) set(XTL_VERSION 0.7.7) - set(XSIMD_VERSION 13.1.0) + set(XSIMD_VERSION 13.2.0) set(DUCC0_VERSION ducc0_0_36_0) set(FINUFFT_FFTW_LIBRARIES) 
include(cmake/setupXSIMD.cmake) diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index 5be96eae9..4d8de4283 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -1,6 +1,11 @@ #ifndef FINUFFT_CORE_H #define FINUFFT_CORE_H +#include + +#include +#include + /* IMPORTANT: for Windows compilers, you should add a line #define FINUFFT_DLL here if you are compiling/using FINUFFT as a DLL, @@ -59,10 +64,6 @@ #define FINUFFT_LIKELY(x) (x) #endif -#include -#include -#include - // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. using BIGINT = int64_t; diff --git a/include/finufft/finufft_utils.hpp b/include/finufft/finufft_utils.hpp index 7577a57a1..ff495b491 100644 --- a/include/finufft/finufft_utils.hpp +++ b/include/finufft/finufft_utils.hpp @@ -12,7 +12,6 @@ // using chrono since the interface is portable between linux and windows namespace finufft::utils { - template FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arrayrange(BIGINT n, const T *a, T *lo, T *hi) @@ -59,8 +58,9 @@ class FINUFFT_EXPORT CNTime { double initial; }; -// openmp helpers -int get_num_threads_parallel_block(); +FINUFFT_NEVER_INLINE int getOptimalThreadCount(); + +FINUFFT_NEVER_INLINE int get_num_threads_parallel_block(); } // namespace finufft::utils diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 0d8c36042..e1d56261b 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -26,5 +26,6 @@ enum { FINUFFT_ERR_NUM_NU_PTS_INVALID = 20, FINUFFT_ERR_INVALID_ARGUMENT = 21, FINUFFT_ERR_LOCK_FUNS_INVALID = 22, + FINUFFT_ERR_NTHREADS_NOTVALID = 23, }; #endif diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 6342c7856..94d0fcdc2 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -592,17 +592,18 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i 
#ifdef _OPENMP // choose overall # threads... - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us + int ompmaxnthr = getOptimalThreadCount(); + int nthr = ompmaxnthr; // default: use as many physical cores as possible // (the above could be set, or suggested set, to 1 for small enough problems...) if (opts.nthreads > 0) { nthr = opts.nthreads; // user override, now without limit if (opts.showwarn && (nthr > ompmaxnthr)) fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "%s warning: using opts.nthreads=%d, more than the %d physical cores " "available; note large nthreads can be slower.\n", __func__, nthr, ompmaxnthr); } + #else int nthr = 1; // always 1 thread (avoid segfault) if (opts.nthreads > 1) @@ -611,6 +612,17 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i __func__, opts.nthreads); #endif opts.nthreads = nthr; // store actual # thr planned for + if (opts.debug > 1) { + printf("[%s] opts.nthreads=%d\n", __func__, nthr); + } + + if (opts.nthreads <= 0) { + fprintf(stderr, + "[%s] error: detecting physical cores failed. Please specify the number " + "of cores to use\n", + __func__); + throw int(FINUFFT_ERR_NTHREADS_NOTVALID); + } // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) diff --git a/src/finufft_utils.cpp b/src/finufft_utils.cpp index 8bcf8ddab..15a22d7ab 100644 --- a/src/finufft_utils.cpp +++ b/src/finufft_utils.cpp @@ -3,14 +3,28 @@ // For self-test see ../test/testutils.cpp. Barnett 2017-2020.
-#include - #include -using namespace std; +#include +#include +#include -namespace finufft::utils { +#if defined(_WIN32) +#include +#include +#elif defined(__APPLE__) +#include +#include +#elif defined(__linux__) +#ifndef _GNU_SOURCE +#define _GNU_SOURCE // Enable GNU extensions for sched_getaffinity +#endif +#include +#include +#include +#endif +namespace finufft::utils { BIGINT next235even(BIGINT n) // finds even integer not less than n, with prime factors no larger than 5 // (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17 @@ -57,6 +71,181 @@ double CNTime::elapsedsec() const return nowsec - initial; } +namespace { +#if defined(_WIN32) +// Returns the number of physical CPU cores on Windows (excluding hyper-threaded cores) +static int getPhysicalCoreCount() { + int physicalCoreCount = 0; + + // Determine the required buffer size. + DWORD bufferSize = 0; + if (GetLogicalProcessorInformation(nullptr, &bufferSize) == FALSE && + GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return MY_OMP_GET_MAX_THREADS(); // size query failed: fall back like the zero-count path below + } + + // Calculate the number of entries and allocate a vector.
+ size_t entryCount = bufferSize / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + std::vector<SYSTEM_LOGICAL_PROCESSOR_INFORMATION> procInfo(entryCount); + if (GetLogicalProcessorInformation(procInfo.data(), &bufferSize) != FALSE) { + for (const auto &info : procInfo) { + if (info.Relationship == RelationProcessorCore) ++physicalCoreCount; + } + } + + if (physicalCoreCount == 0) { + return MY_OMP_GET_MAX_THREADS(); + } + return physicalCoreCount; +} + +static int getAllowedCoreCount() { + DWORD_PTR processMask, systemMask; + if (!GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask)) { + return 0; // API call failed (should rarely happen for the current process) + } + // Count bits in processMask + int count = 0; + while (processMask) { + count += static_cast<int>(processMask & 1U); + processMask >>= 1; + } + return count; +} + +#elif defined(__APPLE__) + +// Returns the number of physical CPU cores on macOS (excluding hyper-threaded cores) +static int getPhysicalCoreCount() { + int physicalCoreCount = 0; + int cores = 0; + size_t size = sizeof(cores); + if (sysctlbyname("hw.physicalcpu", &cores, &size, nullptr, 0) == 0) { + physicalCoreCount = cores; + } + + if (physicalCoreCount == 0) { + return MY_OMP_GET_MAX_THREADS(); + } + return physicalCoreCount; +} + +static int getAllowedCoreCount() { + // macOS does not support CPU affinity, so we return the maximum number of threads. + return MY_OMP_GET_MAX_THREADS(); +} + +#elif defined(__linux__) +// Returns the number of physical CPU cores on Linux (excluding hyper-threaded cores) +static int getPhysicalCoreCount() { + int physicalCoreCount = 0; + std::ifstream cpuinfo("/proc/cpuinfo"); + if (!cpuinfo.is_open()) return MY_OMP_GET_MAX_THREADS(); + + std::set<std::string> coreSet; + std::string line; + int physicalId = -1, coreId = -1; + bool foundPhysical = false, foundCore = false; + + while (std::getline(cpuinfo, line)) { + // An empty line indicates the end of a processor block.
+ if (line.empty()) { + if (foundPhysical && foundCore) + coreSet.insert(std::to_string(physicalId) + "-" + std::to_string(coreId)); + // Reset for the next processor block. + foundPhysical = foundCore = false; + physicalId = coreId = -1; + } else { + auto colonPos = line.find(':'); + if (colonPos == std::string::npos) continue; + std::string key = line.substr(0, colonPos); + std::string value = line.substr(colonPos + 1); + // Trim whitespace. + key.erase(key.find_last_not_of(" \t") + 1); + value.erase(0, value.find_first_not_of(" \t")); + + if (key == "physical id") { + physicalId = std::stoi(value); + foundPhysical = true; + } else if (key == "core id") { + coreId = std::stoi(value); + foundCore = true; + } + } + } + // In case the file doesn't end with an empty line. + if (foundPhysical && foundCore) + coreSet.insert(std::to_string(physicalId) + "-" + std::to_string(coreId)); + + if (!coreSet.empty()) { + physicalCoreCount = static_cast<int>(coreSet.size()); + } else { + // Fallback: try reading "cpu cores" from the first processor block. + cpuinfo.clear(); + cpuinfo.seekg(0, std::ios::beg); + while (std::getline(cpuinfo, line)) { + auto colonPos = line.find(':'); + if (colonPos != std::string::npos) { + std::string key = line.substr(0, colonPos); + std::string value = line.substr(colonPos + 1); + key.erase(key.find_last_not_of(" \t") + 1); + value.erase(0, value.find_first_not_of(" \t")); + if (key == "cpu cores") { + physicalCoreCount = std::stoi(value); + break; + } + } + } + } + + if (physicalCoreCount == 0) { + return MY_OMP_GET_MAX_THREADS(); + } + return physicalCoreCount; +} + +static int getAllowedCoreCount() { + cpu_set_t cpuSet; + CPU_ZERO(&cpuSet); + if (sched_getaffinity(0, sizeof(cpu_set_t), &cpuSet) != 0) { + return 0; // Error (e.g., not supported or failed) + } + int count = 0; + for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) { + if (CPU_ISSET(cpu, &cpuSet)) { + ++count; + } + } + return count; +} + +#else + +#warning "Unknown platform.
Impossible to detect the number of physical cores." +// Fallback version if none of the above platforms is detected. +static int getPhysicalCoreCount() { return MY_OMP_GET_MAX_THREADS(); } +static int getAllowedCoreCount() { return MY_OMP_GET_MAX_THREADS(); } + +#endif + +} // namespace + +int getOptimalThreadCount() { + // use OMP_NUM_THREADS when it parses to a positive count (strtol, unlike stoi, never throws) + const auto OMP_THREADS = std::getenv("OMP_NUM_THREADS"); + const int ompCount = OMP_THREADS ? static_cast<int>(std::strtol(OMP_THREADS, nullptr, 10)) : 0; + if (ompCount > 0) return ompCount; + // unset, malformed or non-positive values fall through to core detection + // otherwise, use the min between number of physical cores or the number of allowed + // cores (e.g. by taskset) + const auto physicalCores = getPhysicalCoreCount(); + const auto allowedCores = getAllowedCoreCount(); + if (physicalCores < allowedCores) { + return physicalCores; + } + return allowedCores; +} + // -------------------------- openmp helpers ------------------------------- int get_num_threads_parallel_block() // return how many threads an omp parallel block would use. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7f2184fec..51c1de72d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,7 +36,7 @@ target_compile_features(testutils PRIVATE cxx_std_17) finufft_link_test(testutils) add_test(NAME run_testutils COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) -if(NOT FINUFFT_USE_DUCC0) +if(NOT FINUFFT_USE_DUCC0 AND FINUFFT_USE_OPENMP) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) finufft_link_test(fftw_lock_test)