diff --git a/.github/workflows/cmake_ci.yml b/.github/workflows/cmake_ci.yml
index b0ab6338b..da12ecbca 100644
--- a/.github/workflows/cmake_ci.yml
+++ b/.github/workflows/cmake_ci.yml
@@ -55,7 +55,7 @@ jobs:
         id: cache
         uses: actions/cache@v4
         with:
-          key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
+          key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
           enableCrossOsArchive: true
           path: cpm
       - name: Setup Cpp
@@ -82,7 +82,7 @@ jobs:
       - name: Cache dependencies
         uses: actions/cache/save@v4
         with:
-          key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
+          key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
           enableCrossOsArchive: true
           path: cpm
   cmake-ci:
@@ -97,18 +97,16 @@ jobs:
       - name: Restore Cache
         uses: actions/cache/restore@v4
         with:
-          key: cpm-cache-00-${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
+          key: cpm-cache-00-${{ runner.os == 'windows' && 'windows-' || '' }}${{ hashFiles('CMakeLists.txt', 'cmake/*') }}
           enableCrossOsArchive: true
           path: cpm
       - name: Download requirements.txt
         uses: actions/download-artifact@v4
         with:
           name: requirements
-      - name: Run sccache-cache only on non-release runs
-        if: github.event_name != 'release' && github.event_name != 'workflow_dispatch'
+      - name: Run sccache-cache
         uses: mozilla-actions/sccache-action@v0.0.7
-      - name: Set caching env vars only on non-release runs
-        if: github.event_name != 'release' && github.event_name != 'workflow_dispatch'
+      - name: Set caching env vars
         run: |
           echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
           echo "SCCACHE_GHA_VERSION=0" >> $GITHUB_ENV
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d21cfc3f..f64bd7812 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -173,7 +173,7 @@ if(FINUFFT_USE_CPU)
     endif()
     set(FFTW_VERSION 3.3.10)
     set(XTL_VERSION 0.7.7)
-    set(XSIMD_VERSION 13.1.0)
+    set(XSIMD_VERSION 13.2.0)
     set(DUCC0_VERSION ducc0_0_36_0)
     set(FINUFFT_FFTW_LIBRARIES)
     include(cmake/setupXSIMD.cmake)
diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h
index 5be96eae9..4d8de4283 100644
--- a/include/finufft/finufft_core.h
+++ b/include/finufft/finufft_core.h
@@ -1,6 +1,11 @@
 #ifndef FINUFFT_CORE_H
 #define FINUFFT_CORE_H
 
+#include <xsimd/xsimd.hpp>
+
+#include <finufft_errors.h>
+#include <memory>
+
 /* IMPORTANT: for Windows compilers, you should add a line
         #define FINUFFT_DLL
    here if you are compiling/using FINUFFT as a DLL,
@@ -59,10 +64,6 @@
 #define FINUFFT_LIKELY(x)   (x)
 #endif
 
-#include <finufft_errors.h>
-#include <memory>
-#include <xsimd/xsimd.hpp>
-
 // All indexing in library that potentially can exceed 2^31 uses 64-bit signed.
 // This includes all calling arguments (eg M,N) that could be huge someday.
 using BIGINT  = int64_t;
diff --git a/include/finufft/finufft_utils.hpp b/include/finufft/finufft_utils.hpp
index 7577a57a1..ff495b491 100644
--- a/include/finufft/finufft_utils.hpp
+++ b/include/finufft/finufft_utils.hpp
@@ -12,7 +12,6 @@
 //  using chrono since the interface is portable between linux and windows
 
 namespace finufft::utils {
-
 template<typename T>
 FINUFFT_EXPORT FINUFFT_ALWAYS_INLINE void FINUFFT_CDECL arrayrange(BIGINT n, const T *a,
                                                                    T *lo, T *hi)
@@ -59,8 +58,9 @@ class FINUFFT_EXPORT CNTime {
   double initial;
 };
 
-// openmp helpers
-int get_num_threads_parallel_block();
+FINUFFT_NEVER_INLINE int getOptimalThreadCount();
+
+FINUFFT_NEVER_INLINE int get_num_threads_parallel_block();
 
 } // namespace finufft::utils
 
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
index 0d8c36042..e1d56261b 100644
--- a/include/finufft_errors.h
+++ b/include/finufft_errors.h
@@ -26,5 +26,6 @@ enum {
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
   FINUFFT_ERR_INVALID_ARGUMENT       = 21,
   FINUFFT_ERR_LOCK_FUNS_INVALID      = 22,
+  FINUFFT_ERR_NTHREADS_NOTVALID      = 23,
 };
 #endif
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
index 6342c7856..94d0fcdc2 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft_core.cpp
@@ -592,17 +592,18 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
 
 #ifdef _OPENMP
   // choose overall # threads...
-  int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
-  int nthr       = ompmaxnthr; // default: use as many as OMP gives us
+  int ompmaxnthr = getOptimalThreadCount();
+  int nthr       = ompmaxnthr; // default: use as many physical cores as possible
   // (the above could be set, or suggested set, to 1 for small enough problems...)
   if (opts.nthreads > 0) {
     nthr = opts.nthreads; // user override, now without limit
     if (opts.showwarn && (nthr > ompmaxnthr))
       fprintf(stderr,
-              "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims "
+              "%s warning: using opts.nthreads=%d, more than the %d physical cores "
               "available; note large nthreads can be slower.\n",
               __func__, nthr, ompmaxnthr);
   }
+
 #else
   int nthr = 1; // always 1 thread (avoid segfault)
   if (opts.nthreads > 1)
@@ -611,6 +612,17 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
             __func__, opts.nthreads);
 #endif
   opts.nthreads = nthr; // store actual # thr planned for
+  if (opts.debug > 1) {
+    printf("[%s] opts.nthreads=%d\n", __func__, nthr);
+  }
+  // A non-positive count means automatic detection gave no usable value: fail early.
+  if (opts.nthreads <= 0) {
+    fprintf(stderr,
+            "[%s] error: detecting physical cores failed. Please specify the number "
+            "of cores to use\n",
+            __func__);
+    throw int(FINUFFT_ERR_NTHREADS_NOTVALID);
+  }
   // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
 
   // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
diff --git a/src/finufft_utils.cpp b/src/finufft_utils.cpp
index 8bcf8ddab..15a22d7ab 100644
--- a/src/finufft_utils.cpp
+++ b/src/finufft_utils.cpp
@@ -3,14 +3,28 @@
 
 // For self-test see ../test/testutils.cpp.      Barnett 2017-2020.
 
-#include <cstdint>
-
 #include <finufft/finufft_utils.hpp>
 
-using namespace std;
+#include <cstdint>
+#include <iostream>
+#include <string>
 
-namespace finufft::utils {
+#if defined(_WIN32)
+#include <vector>
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#elif defined(__linux__)
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE // Enable GNU extensions for sched_getaffinity
+#endif
+#include <fstream>
+#include <sched.h>
+#include <set>
+#endif
 
+namespace finufft::utils {
 BIGINT next235even(BIGINT n)
 // finds even integer not less than n, with prime factors no larger than 5
 // (ie, "smooth"). Adapted from fortran in hellskitchen.  Barnett 2/9/17
@@ -57,6 +71,181 @@ double CNTime::elapsedsec() const
   return nowsec - initial;
 }
 
+namespace {
+#if defined(_WIN32)
+// Returns the number of physical CPU cores on Windows (hyper-threads are not counted).
+// Falls back to MY_OMP_GET_MAX_THREADS() whenever the query fails or finds no cores.
+static int getPhysicalCoreCount() {
+  // A call with a null buffer is used only to learn the required buffer size in bytes.
+  DWORD bufferSize = 0;
+  if (GetLogicalProcessorInformation(nullptr, &bufferSize) == FALSE &&
+      GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    // Use the same fallback as the other failure paths rather than returning 0.
+    return MY_OMP_GET_MAX_THREADS();
+  }
+
+  // Fetch the records; each RelationProcessorCore record is counted as one core.
+  size_t entryCount = bufferSize / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+  std::vector<SYSTEM_LOGICAL_PROCESSOR_INFORMATION> procInfo(entryCount);
+  int physicalCoreCount = 0;
+  if (GetLogicalProcessorInformation(procInfo.data(), &bufferSize) != FALSE) {
+    for (const auto &info : procInfo) {
+      if (info.Relationship == RelationProcessorCore) ++physicalCoreCount;
+    }
+  }
+
+  // Second query failed or reported no cores: defer to OpenMP's thread count.
+  if (physicalCoreCount == 0) return MY_OMP_GET_MAX_THREADS();
+  return physicalCoreCount;
+}
+
+static int getAllowedCoreCount() {
+  // Number of bits set in the current process's affinity mask; 0 if the query fails.
+  DWORD_PTR processMask, systemMask;
+  if (!GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask)) {
+    return 0; // API call failed (should rarely happen for the current process)
+  }
+  // Tally the set bits, consuming the mask from its least significant end.
+  int allowed = 0;
+  for (; processMask != 0; processMask >>= 1) {
+    if (processMask & 1U) ++allowed;
+  }
+  return allowed;
+}
+
+#elif defined(__APPLE__)
+
+// Physical CPU core count on macOS, read from the "hw.physicalcpu" sysctl.
+// Returns MY_OMP_GET_MAX_THREADS() if the sysctl fails or reports zero.
+static int getPhysicalCoreCount() {
+  int reported  = 0;
+  size_t length = sizeof(reported);
+  const bool ok = sysctlbyname("hw.physicalcpu", &reported, &length, nullptr, 0) == 0;
+  // Keep the value only when the query succeeded; otherwise treat it as unknown.
+  const int physicalCoreCount = ok ? reported : 0;
+  if (physicalCoreCount != 0) {
+    return physicalCoreCount;
+  }
+  // Unknown count: use what OpenMP would use.
+  return MY_OMP_GET_MAX_THREADS();
+}
+
+static int getAllowedCoreCount() {
+  // macOS does not support CPU affinity, so report OpenMP's maximum thread count.
+  return MY_OMP_GET_MAX_THREADS();
+}
+
+#elif defined(__linux__)
+// Returns the number of physical CPU cores on Linux (excluding hyper-threaded cores).
+// Counts distinct "physical id"/"core id" pairs in /proc/cpuinfo, else uses the first
+// readable "cpu cores" value, else falls back to MY_OMP_GET_MAX_THREADS().
+static int getPhysicalCoreCount() {
+  std::ifstream cpuinfo("/proc/cpuinfo");
+  if (!cpuinfo.is_open()) return MY_OMP_GET_MAX_THREADS();
+
+  // Splits "key : value" at the first colon and trims the blanks next to it.
+  const auto splitField = [](const std::string &text, std::string &key,
+                             std::string &value) {
+    const auto colonPos = text.find(':');
+    if (colonPos == std::string::npos) return false;
+    key   = text.substr(0, colonPos);
+    value = text.substr(colonPos + 1);
+    key.erase(key.find_last_not_of(" \t") + 1);
+    value.erase(0, value.find_first_not_of(" \t"));
+    return true;
+  };
+  // std::stoi throws on malformed or out-of-range text; report failure instead.
+  const auto parseInt = [](const std::string &text, int &out) {
+    try {
+      out = std::stoi(text);
+      return true;
+    } catch (...) {
+      return false;
+    }
+  };
+  std::set<std::string> coreSet;
+  std::string line, key, value;
+  int physicalId = -1, coreId = -1;
+  bool foundPhysical = false, foundCore = false;
+  // Records the core of the processor block just read, then resets the block state.
+  const auto flushBlock = [&]() {
+    if (foundPhysical && foundCore)
+      coreSet.insert(std::to_string(physicalId) + "-" + std::to_string(coreId));
+    foundPhysical = foundCore = false;
+    physicalId = coreId = -1;
+  };
+
+  while (std::getline(cpuinfo, line)) {
+    if (line.empty()) {
+      flushBlock(); // an empty line indicates the end of a processor block
+    } else if (splitField(line, key, value)) {
+      if (key == "physical id") {
+        foundPhysical = parseInt(value, physicalId);
+      } else if (key == "core id") {
+        foundCore = parseInt(value, coreId);
+      }
+    }
+  }
+  flushBlock(); // in case the file doesn't end with an empty line
+
+  int physicalCoreCount = static_cast<int>(coreSet.size());
+  if (physicalCoreCount == 0) {
+    // Fallback: use the first "cpu cores" value that parses as an integer.
+    cpuinfo.clear(); // reset the EOF state left by the loop above
+    cpuinfo.seekg(0, std::ios::beg);
+    while (std::getline(cpuinfo, line)) {
+      if (splitField(line, key, value) && key == "cpu cores" &&
+          parseInt(value, physicalCoreCount))
+        break;
+    }
+  }
+
+  if (physicalCoreCount == 0) return MY_OMP_GET_MAX_THREADS();
+  return physicalCoreCount;
+}
+
+static int getAllowedCoreCount() {
+  // Size of the affinity mask obtained from sched_getaffinity(0, ...); 0 on failure.
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  const int rc = sched_getaffinity(0, sizeof(cpu_set_t), &affinity);
+  if (rc != 0) return 0; // Error (e.g., not supported or failed)
+  int allowed = 0;
+  int cpu     = 0;
+  while (cpu < CPU_SETSIZE) {
+    allowed += CPU_ISSET(cpu, &affinity) ? 1 : 0;
+    ++cpu;
+  }
+  return allowed;
+}
+
+#else
+
+#warning "Unknown platform. Impossible to detect the number of physical cores."
+// Fallback for unrecognized platforms: both counts defer to MY_OMP_GET_MAX_THREADS().
+static int getPhysicalCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
+static int getAllowedCoreCount() { return MY_OMP_GET_MAX_THREADS(); }
+
+#endif
+
+} // namespace
+
+int getOptimalThreadCount() {
+  // Default thread count: a positive integer in OMP_NUM_THREADS wins; otherwise the
+  // min of the physical core count and the allowed core count (e.g. by taskset).
+  if (const char *env = std::getenv("OMP_NUM_THREADS")) {
+    try {
+      const int requested = std::stoi(env); // throws on empty/non-numeric/huge text
+      if (requested > 0) return requested;
+    } catch (...) {
+      // Unusable value: ignore it and fall through to detection.
+    }
+  }
+  const int physicalCores = getPhysicalCoreCount();
+  const int allowedCores  = getAllowedCoreCount();
+  return physicalCores < allowedCores ? physicalCores : allowedCores;
+}
+
 // -------------------------- openmp helpers -------------------------------
 int get_num_threads_parallel_block()
 // return how many threads an omp parallel block would use.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 7f2184fec..51c1de72d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -36,7 +36,7 @@ target_compile_features(testutils PRIVATE cxx_std_17)
 finufft_link_test(testutils)
 add_test(NAME run_testutils COMMAND testutils WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 
-if(NOT FINUFFT_USE_DUCC0)
+if(NOT FINUFFT_USE_DUCC0 AND FINUFFT_USE_OPENMP)
     add_executable(fftw_lock_test fftw_lock_test.cpp)
     target_compile_features(fftw_lock_test PRIVATE cxx_std_17)
     finufft_link_test(fftw_lock_test)