From 31d6c5dbd57b85c169564c4b582e578fd6915dba Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Fri, 15 Nov 2024 09:31:27 -0500
Subject: [PATCH 01/37] DOC v25.02 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  6 ++--
 .devcontainer/cuda11.8-pip/devcontainer.json  |  8 ++---
 .../cuda12.5-conda/devcontainer.json          |  6 ++--
 .devcontainer/cuda12.5-pip/devcontainer.json  |  8 ++---
 .github/workflows/build.yaml                  | 16 +++++-----
 .github/workflows/pr.yaml                     | 28 ++++++++---------
 .github/workflows/test.yaml                   | 10 +++----
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-118_arch-aarch64.yaml            | 14 ++++-----
 .../all_cuda-118_arch-x86_64.yaml             | 14 ++++-----
 .../all_cuda-125_arch-aarch64.yaml            | 14 ++++-----
 .../all_cuda-125_arch-x86_64.yaml             | 14 ++++-----
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  4 +--
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  4 +--
 .../bench_ann_cuda-120_arch-aarch64.yaml      |  4 +--
 .../bench_ann_cuda-120_arch-x86_64.yaml       |  4 +--
 .../recipes/raft-dask/conda_build_config.yaml |  4 +--
 .../cmake/thirdparty/fetch_rapids.cmake       |  2 +-
 dependencies.yaml                             | 30 +++++++++----------
 docs/source/build.md                          |  2 +-
 docs/source/developer_guide.md                |  6 ++--
 docs/source/raft_ann_benchmarks.md            | 12 ++++----
 python/pylibraft/pyproject.toml               |  4 +--
 .../raft-dask/cmake/thirdparty/get_ucxx.cmake |  4 +--
 python/raft-dask/pyproject.toml               | 10 +++----
 26 files changed, 116 insertions(+), 116 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 008bf8730a..8c857961c2 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 75aed80f9f..c691ed6007 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 240ba02131..dc4fcd02fd 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index c23c79017a..bc43900ef3 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
       "version": "12.5",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index db379c9d47..7879f22879 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
       package-name: pylibraft
   wheel-build-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -98,7 +98,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index fe8e730921..e349b25ce6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -26,13 +26,13 @@ jobs:
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
     with:
       files_yaml: |
         test_cpp:
@@ -65,27 +65,27 @@ jobs:
           - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: pull-request
       enable_check_symbols: true
@@ -93,20 +93,20 @@ jobs:
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -116,14 +116,14 @@ jobs:
   wheel-build-pylibraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: [wheel-build-pylibraft, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
@@ -131,21 +131,21 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-tests-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: [wheel-build-raft-dask, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2bee8a3d1d..1ae093bc56 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -26,7 +26,7 @@ jobs:
       symbol_exclusions: raft_cutlass
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 8870e9385e..8d16fc5842 100755
--- a/README.md
+++ b/README.md
@@ -255,7 +255,7 @@ You can also install the conda packages individually using the `mamba` command a
 mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5
 ```
 
-If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.12/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-25.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
 
 ### Installing Python through Pip
 
diff --git a/VERSION b/VERSION
index af28c42b52..72eefaf7c7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.12.00
+25.02.00
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 6098cd12bf..269af03e9f 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -20,8 +20,8 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - graphviz
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
 - numba>=0.57
@@ -44,18 +44,18 @@ dependencies:
 - nvcc_linux-aarch64=11.8
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0fe8fbab39..4c7150264b 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,8 +20,8 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
 - numba>=0.57
@@ -44,18 +44,18 @@ dependencies:
 - nvcc_linux-64=11.8
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index dfb9ac0b97..648a5a00f0 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -21,8 +21,8 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - graphviz
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
 - numba>=0.57
@@ -40,18 +40,18 @@ dependencies:
 - numpydoc
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index bf6f5d6462..7d7b9c4454 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,8 +21,8 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
 - numba>=0.57
@@ -40,18 +40,18 @@ dependencies:
 - numpydoc
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-125_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 39bdf2671d..777d2ddb7f 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -30,7 +30,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -40,7 +40,7 @@ dependencies:
 - pandas
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 56004fa818..7fa432c8d6 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -30,7 +30,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -40,7 +40,7 @@ dependencies:
 - pandas
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
index 5f0599d9ae..0f59fc6090 100644
--- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -27,7 +27,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -36,7 +36,7 @@ dependencies:
 - pandas
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-120_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
index 849e6c1412..273d6a9f9b 100644
--- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -27,7 +27,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -36,7 +36,7 @@ dependencies:
 - pandas
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-120_arch-x86_64
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index d7d2f68b42..68140e6bc0 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -17,10 +17,10 @@ c_stdlib_version:
   - "2.17"
 
 ucx_py_version:
-  - "0.41.*"
+  - "0.42.*"
 
 ucxx_version:
-  - "0.41.*"
+  - "0.42.*"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
index 6f4c627ed4..23c8490b40 100644
--- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake
+++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 
 # Use this variable to update RAPIDS and RAFT versions
-set(RAPIDS_VERSION "24.12")
+set(RAPIDS_VERSION "25.02")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/dependencies.yaml b/dependencies.yaml
index 7766481c99..fb58e93f71 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -171,7 +171,7 @@ dependencies:
           - c-compiler
           - cxx-compiler
           - nccl>=2.19
-          - libucxx==0.41.*,>=0.0.0a0
+          - libucxx==0.42.*,>=0.0.0a0
     specific:
       - output_types: conda
         matrices:
@@ -210,7 +210,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - &rmm_unsuffixed rmm==24.12.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -237,12 +237,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - &rmm_cu12 rmm-cu12==24.12.*,>=0.0.0a0
+              - &rmm_cu12 rmm-cu12==25.2.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - &rmm_cu11 rmm-cu11==24.12.*,>=0.0.0a0
+              - &rmm_cu11 rmm-cu11==25.2.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_unsuffixed] }
   checks:
     common:
@@ -514,14 +514,14 @@ dependencies:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask-cuda==24.12.*,>=0.0.0a0
+          - dask-cuda==25.2.*,>=0.0.0a0
           - joblib>=0.11
           - numba>=0.57
-          - rapids-dask-dependency==24.12.*,>=0.0.0a0
+          - rapids-dask-dependency==25.2.*,>=0.0.0a0
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0
-          - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==25.2.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.42.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -535,14 +535,14 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - &pylibraft_cu12 pylibraft-cu12==24.12.*,>=0.0.0a0
-              - &ucx_py_cu12 ucx-py-cu12==0.41.*,>=0.0.0a0
+              - &pylibraft_cu12 pylibraft-cu12==25.2.*,>=0.0.0a0
+              - &ucx_py_cu12 ucx-py-cu12==0.42.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - &pylibraft_cu11 pylibraft-cu11==24.12.*,>=0.0.0a0
-              - &ucx_py_cu11 ucx-py-cu11==0.41.*,>=0.0.0a0
+              - &pylibraft_cu11 pylibraft-cu11==25.2.*,>=0.0.0a0
+              - &ucx_py_cu11 ucx-py-cu11==0.42.*,>=0.0.0a0
           - {matrix: null, packages: [*pylibraft_unsuffixed, *ucx_py_unsuffixed]}
   test_python_common:
     common:
@@ -562,7 +562,7 @@ dependencies:
         packages:
           # UCXX is not currently a hard-dependency thus only installed during tests,
           # this will change in the future.
-          - &distributed_ucxx_unsuffixed distributed-ucxx==0.41.*,>=0.0.0a0
+          - &distributed_ucxx_unsuffixed distributed-ucxx==0.42.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -575,12 +575,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - distributed-ucxx-cu12==0.41.*,>=0.0.0a0
+              - distributed-ucxx-cu12==0.42.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - distributed-ucxx-cu11==0.41.*,>=0.0.0a0
+              - distributed-ucxx-cu11==0.42.*,>=0.0.0a0
           - {matrix: null, packages: [*distributed_ucxx_unsuffixed]}
   depends_on_ucx_build:
     common:
diff --git a/docs/source/build.md b/docs/source/build.md
index b9a1832b02..0c4ab17ed0 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a
 mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0
 ```
 
-If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.12/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-25.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
 
 ## Installing Python through Pip
 
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index c4a099fabb..5cc694dc8f 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/.clang-format).
 
 [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter.
 In order to run doxygen as a linter on C++/CUDA code, run
@@ -205,7 +205,7 @@ you can run  `codespell -i 3 -w .` from the repository root directory.
 This will bring up an interactive prompt to select which spelling fixes to apply.
 
 ### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
@@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 12a94e45ce..b7f7cc81d4 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -66,7 +66,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft
 - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10:
 
 ```bash
-docker pull rapidsai/raft-ann-bench:24.12a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container.
+docker pull rapidsai/raft-ann-bench:25.02a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container.
 ```
 
 The CUDA and python versions can be changed for the supported values:
@@ -87,7 +87,7 @@ You can see the exact versions as well in the dockerhub site:
 [//]: # ()
 [//]: # (```bash)
 
-[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.12-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.)
+[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:25.02-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.)
 
 [//]: # (```)
 
@@ -348,7 +348,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe
 export DATA_FOLDER=path/to/store/datasets/and/results
 docker run --gpus all --rm -it -u $(id -u)                      \
     -v $DATA_FOLDER:/data/benchmarks                            \
-    rapidsai/raft-ann-bench:24.12a-cuda11.8-py3.10              \
+    rapidsai/raft-ann-bench:25.02a-cuda11.8-py3.10              \
     "--dataset deep-image-96-angular"                           \
     "--normalize"                                               \
     "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \
@@ -359,7 +359,7 @@ Usage of the above command is as follows:
 
 | Argument                                                  | Description                                                                                        |
 |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------|
-| `rapidsai/raft-ann-bench:24.12a-cuda11.8-py3.10`          | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets`                          |
+| `rapidsai/raft-ann-bench:25.02a-cuda11.8-py3.10`          | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets`                          |
 | `"--dataset deep-image-96-angular"`                       | Dataset name                                                                                       |
 | `"--normalize"`                                           | Whether to normalize the dataset                                                                   |
 | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` |
@@ -376,7 +376,7 @@ The container arguments in the above section also be used for the CPU-only conta
 export DATA_FOLDER=path/to/store/datasets/and/results
 docker run  --rm -it -u $(id -u)                  \
     -v $DATA_FOLDER:/data/benchmarks              \
-    rapidsai/raft-ann-bench-cpu:24.12a-py3.10     \
+    rapidsai/raft-ann-bench-cpu:25.02a-py3.10     \
      "--dataset deep-image-96-angular"            \
      "--normalize"                                \
      "--algorithms hnswlib --batch-size 10 -k 10" \
@@ -393,7 +393,7 @@ docker run --gpus all --rm -it -u $(id -u)          \
     --entrypoint /bin/bash                          \
     --workdir /data/benchmarks                      \
     -v $DATA_FOLDER:/data/benchmarks                \
-    rapidsai/raft-ann-bench:24.12a-cuda11.8-py3.10 
+    rapidsai/raft-ann-bench:25.02a-cuda11.8-py3.10 
 ```
 
 This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above:
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index bb01602b33..3502d82fd4 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "nvidia-curand",
     "nvidia-cusolver",
     "nvidia-cusparse",
-    "rmm==24.12.*,>=0.0.0a0",
+    "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -125,7 +125,7 @@ requires = [
     "cuda-python",
     "cython>=3.0.0,<3.1.0a0",
     "ninja",
-    "rmm==24.12.*,>=0.0.0a0",
+    "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
index db9b5c6b4d..f5daf70f92 100644
--- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
@@ -47,9 +47,9 @@ endfunction()
 # Change pinned tag here to test a commit in CI
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
-find_and_configure_ucxx(VERSION  0.41
+find_and_configure_ucxx(VERSION  0.42
         FORK             rapidsai
-        PINNED_TAG       branch-0.41
+        PINNED_TAG       branch-0.42
         EXCLUDE_FROM_ALL YES
         UCXX_STATIC      ${RAFT_DASK_UCXX_STATIC}
     )
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index a9f4de5dc3..33643c481e 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -31,13 +31,13 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "dask-cuda==24.12.*,>=0.0.0a0",
-    "distributed-ucxx==0.41.*,>=0.0.0a0",
+    "dask-cuda==25.2.*,>=0.0.0a0",
+    "distributed-ucxx==0.42.*,>=0.0.0a0",
     "joblib>=0.11",
     "numba>=0.57",
-    "pylibraft==24.12.*,>=0.0.0a0",
-    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
-    "ucx-py==0.41.*,>=0.0.0a0",
+    "pylibraft==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "ucx-py==0.42.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 4cdc1d80aa01c147a94eed9fbc68a38fba29eaf4 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:27:54 -0600
Subject: [PATCH 02/37] Add breaking change workflow trigger (#2482)

Adds a workflow that triggers a second workflow, which sends a
notification to a designated Slack channel for every PR labelled
`breaking` whenever any of the following events occurs on the PR:

- closed
- reopened
- labeled
- unlabeled

Depends on https://github.com/rapidsai/shared-workflows/pull/257
---
 .../trigger-breaking-change-alert.yaml        | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 .github/workflows/trigger-breaking-change-alert.yaml

diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
new file mode 100644
index 0000000000..3b972f31ca
--- /dev/null
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -0,0 +1,26 @@
+name: Trigger Breaking Change Notifications
+
+on:
+  pull_request_target:
+    types:
+      - closed
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  trigger-notifier:
+    if: contains(github.event.pull_request.labels.*.name, 'breaking')
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    with:
+      sender_login: ${{ github.event.sender.login }}
+      sender_avatar: ${{ github.event.sender.avatar_url }}
+      repo: ${{ github.repository }}
+      pr_number: ${{ github.event.pull_request.number }}
+      pr_title: "${{ github.event.pull_request.title }}"
+      pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}"
+      pr_base_ref: ${{ github.event.pull_request.base.ref }}
+      pr_author: ${{ github.event.pull_request.user.login }}
+      event_action: ${{ github.event.action }}
+      pr_merged: ${{ github.event.pull_request.merged }}

From adfd2f6f765b5a979742399e10581b75ba5a2834 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 26 Nov 2024 16:38:40 -0600
Subject: [PATCH 03/37] Require approval to run CI on draft PRs (#2512)

By default, CI runs on draft PRs. This leads to many CI runs that may be unnecessary.

With this PR's change to `.github/copy-pr-bot.yaml`, an `/ok to test` comment from a trusted user is required to trigger CI on draft PRs. Non-draft PRs will run CI by default, assuming that all commits are signed by trusted users. Otherwise an `/ok to test` is required (as before) -- see the `copy-pr-bot` docs at https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ for more information.

Part of https://github.com/rapidsai/build-planning/issues/123.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/raft/pull/2512
---
 .github/copy-pr-bot.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83ee5..e0ea775aad 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,4 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+auto_sync_draft: false

From c943181c4d48e5050f2b8c40f17e40155bfd9d61 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 26 Nov 2024 19:29:03 -0600
Subject: [PATCH 04/37] Shrink wheel size limit following removal of vector
 search APIs. (#2509)

Following #2498, we can apply this feedback from #2490: https://github.com/rapidsai/raft/pull/2490#discussion_r1841357165

These changes are inspired by https://github.com/rapidsai/cuvs/pull/469.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/raft/pull/2509
---
 ci/build_wheel_pylibraft.sh     |  2 +-
 ci/build_wheel_raft_dask.sh     |  2 +-
 ci/validate_wheel.sh            | 24 ++++++++++++++++++++++++
 python/pylibraft/pyproject.toml |  4 +---
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
index dacaa1190e..dd62ab5399 100755
--- a/ci/build_wheel_pylibraft.sh
+++ b/ci/build_wheel_pylibraft.sh
@@ -18,4 +18,4 @@ esac
 export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF${EXTRA_CMAKE_ARGS}"
 
 ci/build_wheel.sh pylibraft ${package_dir}
-ci/validate_wheel.sh ${package_dir} final_dist
+ci/validate_wheel.sh ${package_dir} final_dist pylibraft
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
index e4f3f0a833..d49d131abf 100755
--- a/ci/build_wheel_raft_dask.sh
+++ b/ci/build_wheel_raft_dask.sh
@@ -9,4 +9,4 @@ package_dir="python/raft-dask"
 export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
 
 ci/build_wheel.sh raft-dask ${package_dir}
-ci/validate_wheel.sh ${package_dir} final_dist
+ci/validate_wheel.sh ${package_dir} final_dist raft-dask
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index 5910a5c59f..5ef72ad895 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -5,6 +5,29 @@ set -euo pipefail
 
 package_dir=$1
 wheel_dir_relative_path=$2
+package_name=$3
+
+RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
+
+# some packages are much larger on CUDA 11 than on CUDA 12
+if [[ "${package_name}" == "raft-dask" ]]; then
+    PYDISTCHECK_ARGS=(
+        --max-allowed-size-compressed '200M'
+    )
+elif [[ "${package_name}" == "pylibraft" ]]; then
+    if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
+        PYDISTCHECK_ARGS=(
+            --max-allowed-size-compressed '600M'
+        )
+    else
+        PYDISTCHECK_ARGS=(
+            --max-allowed-size-compressed '100M'
+        )
+    fi
+else
+    echo "Unsupported package name: ${package_name}"
+    exit 1
+fi
 
 cd "${package_dir}"
 
@@ -12,6 +35,7 @@ rapids-logger "validate packages with 'pydistcheck'"
 
 pydistcheck \
     --inspect \
+    "${PYDISTCHECK_ARGS[@]}" \
     "$(echo ${wheel_dir_relative_path}/*.whl)"
 
 rapids-logger "validate packages with 'twine'"
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index 3502d82fd4..ba454af591 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -132,12 +132,10 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 
 [tool.pydistcheck]
 select = [
+    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
 
-# detect when package size grows significantly
-max_allowed_size_compressed = '825M'
-
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",

From 0e6d35f7cbb4354641d69868d6cb10dcee21fbca Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 27 Nov 2024 12:36:38 -0800
Subject: [PATCH 05/37] Adapt to rmm logger changes (#2513)

This PR adapts to breaking changes in rmm in https://github.com/rapidsai/rmm/pull/1722.

This PR is a breaking change: consumers of raft that use any functionality touching rmm logging will now also need to link to the `rmm::rmm_logger_impl` target.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2513
---
 cpp/CMakeLists.txt                    | 13 ++++++++++---
 cpp/bench/prims/CMakeLists.txt        |  4 ++++
 cpp/cmake/thirdparty/get_spdlog.cmake |  6 +++---
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 780f6f8581..78a4dbb913 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -180,7 +180,10 @@ target_include_directories(
 )
 
 # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target.
-target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass CCCL::CCCL)
+target_link_libraries(
+  raft INTERFACE rmm::rmm rmm::rmm_logger spdlog::spdlog_header_only cuco::cuco
+                 nvidia::cutlass::cutlass CCCL::CCCL
+)
 
 target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 target_compile_options(
@@ -288,8 +291,10 @@ if(RAFT_COMPILE_LIBRARY)
                       "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
 
-  add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
-  add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
+  # Make sure not to add the rmm logger twice since it will be brought in as an interface source by
+  # the rmm::rmm_logger_impl target.
+  add_library(raft_lib SHARED $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
+  add_library(raft_lib_static STATIC $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
 
   set_target_properties(
     raft_lib raft_lib_static
@@ -313,6 +318,8 @@ if(RAFT_COMPILE_LIBRARY)
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
   endforeach()
+  target_link_libraries(raft_lib PRIVATE rmm::rmm_logger_impl)
+  target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl)
 endif()
 
 if(TARGET raft_lib AND (NOT TARGET raft::raft_lib))
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index cf03a36612..edc1af4e02 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -32,6 +32,7 @@ function(ConfigureBench)
     PRIVATE raft::raft
             raft_internal
             $<$<BOOL:${ConfigureBench_LIB}>:raft::compiled>
+            $<$<NOT:$<BOOL:${ConfigureBench_LIB}>>:bench_rmm_logger>
             ${RAFT_CTK_MATH_DEPENDENCIES}
             benchmark::benchmark
             Threads::Threads
@@ -73,6 +74,9 @@ function(ConfigureBench)
 
 endfunction()
 
+add_library(bench_rmm_logger OBJECT)
+target_link_libraries(bench_rmm_logger PRIVATE rmm::rmm_logger_impl)
+
 if(BUILD_PRIMS_BENCH)
   ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp)
 
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
index 57e38c2638..b1ffbe246f 100644
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -16,9 +16,9 @@
 function(find_and_configure_spdlog)
 
     include(${rapids-cmake-dir}/cpm/spdlog.cmake)
-    rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports)
-    rapids_export_package(BUILD spdlog rmm-exports)
+    rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET raft-exports)
+    rapids_export_package(BUILD spdlog raft-exports)
 
 endfunction()
 
-find_and_configure_spdlog()
\ No newline at end of file
+find_and_configure_spdlog()

From fc7818f078a69393e8a0cb27c117b19208c76aaf Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 4 Dec 2024 10:48:49 -0600
Subject: [PATCH 06/37] prefer system install of UCX in devcontainers, update
 outdated RAPIDS references (#2514)

Contributes to https://github.com/rapidsai/build-planning/issues/118

Proposes the following changes for pip devcontainers:

* prefer system installation of ucx to the one provided by the `libucx-cu{11,12}` wheels (ref: https://github.com/rapidsai/devcontainers/pull/421#issuecomment-2502324982)

And some other related changes noticed while doing that:

* update lingering `24.*` references to `25.02`

## Notes for Reviewers

### How I tested this

Relying on CI for most things. Double-checked that `update-version.sh` would have caught the one lingering `24.12` reference like this:

```shell
./ci/release/update-version.sh '25.02.00'
git grep -E '24\.'
```

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2514
---
 .devcontainer/Dockerfile                             | 1 +
 .github/workflows/trigger-breaking-change-alert.yaml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index dc12ab2ade..0f6a8b46af 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -13,6 +13,7 @@ RUN apt update -y \
  && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
 
 ENV DEFAULT_VIRTUAL_ENV=rapids
+ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
 
 FROM ${BASE} as conda-base
 
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 3b972f31ca..01dd2436be 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}

From 3ce5b6ad45946a9c790711addb7b5d358534d8d9 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 4 Dec 2024 18:14:06 -0600
Subject: [PATCH 07/37] Remove upper bounds on cuda-python to allow 12.6.2 and
 11.8.5 (#2517)

Now that some upstream bugs have been fixed, we can allow cuda-python 12.6.2 and 11.8.5.

See https://github.com/NVIDIA/cuda-python/issues/226#issuecomment-2472355738 for more information.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2517
---
 conda/environments/all_cuda-118_arch-aarch64.yaml | 3 ++-
 conda/environments/all_cuda-118_arch-x86_64.yaml  | 3 ++-
 conda/environments/all_cuda-125_arch-aarch64.yaml | 3 ++-
 conda/environments/all_cuda-125_arch-x86_64.yaml  | 3 ++-
 conda/recipes/pylibraft/meta.yaml                 | 8 ++++----
 conda/recipes/raft-dask/meta.yaml                 | 8 ++++----
 cpp/cmake/thirdparty/get_rmm.cmake                | 2 +-
 dependencies.yaml                                 | 7 ++++---
 8 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 269af03e9f..f8201cbccf 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -14,7 +14,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
@@ -54,6 +54,7 @@ dependencies:
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 4c7150264b..66b97854ab 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -14,7 +14,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
@@ -54,6 +54,7 @@ dependencies:
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 648a5a00f0..1fd6edfb6f 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
@@ -50,6 +50,7 @@ dependencies:
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 7d7b9c4454..72108fed48 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
@@ -50,6 +50,7 @@ dependencies:
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 01a9d61f0f..f1edf5d767 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -61,10 +61,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     {% endif %}
     - libraft {{ version }}
     - libraft-headers {{ version }}
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 02a8957b06..14ffa5c092 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -62,10 +62,10 @@ requirements:
   run:
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - dask-cuda ={{ minor_version }}
diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake
index 5a7d54ea4a..0e93363039 100644
--- a/cpp/cmake/thirdparty/get_rmm.cmake
+++ b/cpp/cmake/thirdparty/get_rmm.cmake
@@ -17,7 +17,7 @@
 function(find_and_configure_rmm)
     include(${rapids-cmake-dir}/cpm/rmm.cmake)
     rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports
-                   INSTALL_EXPORT_SET  raft-exports)
+                   INSTALL_EXPORT_SET raft-exports)
 endfunction()
 
 find_and_configure_rmm()
diff --git a/dependencies.yaml b/dependencies.yaml
index daef3ad2ea..80c7f29447 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -143,8 +143,9 @@ dependencies:
         packages:
           - c-compiler
           - cxx-compiler
-          - nccl>=2.19
           - libucxx==0.42.*,>=0.0.0a0
+          - nccl>=2.19
+          - spdlog>=1.14.1,<1.15
     specific:
       - output_types: conda
         matrices:
@@ -196,11 +197,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0
+              - &cuda_python12 cuda-python>=12.0,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3
+              - &cuda_python11 cuda-python>=11.7.1,<12.0a0
           - matrix:
             packages:
               - &cuda_python cuda-python

From ee45ce786686b54d1972408b927d7fcd8ce0cf20 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Sat, 7 Dec 2024 00:35:26 -0600
Subject: [PATCH 08/37] Update cuda-python lower bounds to 12.6.2 / 11.8.5
 (#2522)

We require a newer cuda-python lower bound for new features and to use the new layout.
This will fix a number of errors observed when the runtime version of cuda-python is older than the version used to build packages using Cython features from cuda-python.

See https://github.com/rapidsai/build-planning/issues/117#issuecomment-2524250915 for details.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/raft/pull/2522
---
 conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml  | 2 +-
 conda/recipes/pylibraft/meta.yaml                 | 8 ++++----
 conda/recipes/raft-dask/meta.yaml                 | 8 ++++----
 dependencies.yaml                                 | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index f8201cbccf..e145aeb92e 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -14,7 +14,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 66b97854ab..75dcffa95d 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -14,7 +14,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 1fd6edfb6f..bfa32c80d1 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 72108fed48..98ec334635 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -16,7 +16,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index f1edf5d767..4a8ed29c85 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -61,10 +61,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - libraft {{ version }}
     - libraft-headers {{ version }}
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 14ffa5c092..a8be273f82 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -62,10 +62,10 @@ requirements:
   run:
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - dask-cuda ={{ minor_version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 80c7f29447..37ea223a01 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -197,11 +197,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0
+              - &cuda_python12 cuda-python>=12.6.2,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+              - &cuda_python11 cuda-python>=11.8.5,<12.0a0
           - matrix:
             packages:
               - &cuda_python cuda-python

From 1e5030d1b4f85a9f306c36f8a030494fa59aaaa4 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 11 Dec 2024 23:39:15 +0100
Subject: [PATCH 09/37] Fix rnd bit generation in rmat_rectangular_kernel
 (#2524)

On certain architectures, the compiled code always produces a zero destination bit in the following loop https://github.com/rapidsai/raft/blob/ee45ce786686b54d1972408b927d7fcd8ce0cf20/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh#L160-L162, irrespective of the random value that should determine that bit of `dst_id`.

This PR refactors the loop. This way the `dst_id` number has the desired random distribution for all bits.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2524
---
 .../detail/rmat_rectangular_generator.cuh       | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
index 9ad7c68f87..24207ba6db 100644
--- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -151,15 +151,16 @@ RAFT_KERNEL rmat_gen_kernel(IdxT* out,
   raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
   auto min_scale = min(r_scale, c_scale);
   IdxT i         = 0;
-  for (; i < min_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a, a + b, a + b + c, r_scale, c_scale, i, gen);
-  }
-  for (; i < r_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a + b, a + b, ProbT(1), r_scale, c_scale, i, gen);
-  }
-  for (; i < c_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a + c, ProbT(1), ProbT(1), r_scale, c_scale, i, gen);
+  // Whether we have more rows than columns.
+  const bool more_rows = r_scale > c_scale;
+
+  for (; i < max_scale; ++i) {
+    ProbT A   = (i < min_scale) ? a : (more_rows ? a + b : a + c);
+    ProbT AB  = (i < min_scale) ? a + b : (more_rows ? a + b : ProbT(1));
+    ProbT ABC = (i < min_scale) ? a + b + c : ProbT(1);
+    gen_and_update_bits(src_id, dst_id, A, AB, ABC, r_scale, c_scale, i, gen);
   }
+
   store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
 }
 

From 3720d8e91c21ec95d3dbe8e0d1a4515eb60fa7fa Mon Sep 17 00:00:00 2001
From: rhdong <rhdong2017@gmail.com>
Date: Wed, 11 Dec 2024 14:41:14 -0800
Subject: [PATCH 10/37] [Opt] Optimizing the performance of `bitmap_to_csr`
 (#2516)

This PR speeds up the `bitmap_to_csr`-related kernels by roughly 14x to 1000x, depending on the input shape. The same approach could also benefit `bitset_to_csr` in the future.

#### After (Updated Dec 08)

```shell
---------------------------------------------------------------------------------------------------
Benchmark                                                         Time             CPU   Iterations
---------------------------------------------------------------------------------------------------
BitmapToCsrBench<uint32_t, int64_t, float>/0/manual_time      0.161 ms        0.197 ms         4350  rows*cols=1*100000000	sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/1/manual_time      0.110 ms        0.147 ms         6363  rows*cols=1*100000000	sparsity=0.99
BitmapToCsrBench<uint32_t, int64_t, float>/2/manual_time       14.2 ms         14.2 ms           50  rows*cols=100*100000000	sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/3/manual_time       8.76 ms         8.80 ms           80  rows*cols=100*100000000	sparsity=0.99
```

#### Before

```shell
---------------------------------------------------------------------------------------------------
Benchmark                                                         Time             CPU   Iterations
---------------------------------------------------------------------------------------------------
BitmapToCsrBench<uint32_t, int64_t, float>/0/manual_time        176 ms          176 ms            4  rows*cols=1*100000000	sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/1/manual_time        146 ms          146 ms            5  rows*cols=1*100000000	sparsity=0.99
BitmapToCsrBench<uint32_t, int64_t, float>/2/manual_time        180 ms          180 ms            4  rows*cols=100*100000000	sparsity=0.95
BitmapToCsrBench<uint32_t, int64_t, float>/3/manual_time        148 ms          148 ms            5  rows*cols=100*100000000	sparsity=0.99
```

Authors:
  - rhdong (https://github.com/rhdong)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2516
---
 cpp/bench/prims/sparse/bitmap_to_csr.cu       |  26 +-
 .../sparse/convert/detail/bitmap_to_csr.cuh   | 358 ++++++++++--------
 cpp/include/raft/util/device_loads_stores.cuh |  42 ++
 cpp/test/sparse/convert_csr.cu                |  78 ++--
 4 files changed, 319 insertions(+), 185 deletions(-)

diff --git a/cpp/bench/prims/sparse/bitmap_to_csr.cu b/cpp/bench/prims/sparse/bitmap_to_csr.cu
index ed53df3265..71aabb1bf9 100644
--- a/cpp/bench/prims/sparse/bitmap_to_csr.cu
+++ b/cpp/bench/prims/sparse/bitmap_to_csr.cu
@@ -71,7 +71,7 @@ struct BitmapToCsrBench : public fixture {
   index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
     for (auto& item : bitmap) {
@@ -141,7 +141,27 @@ const std::vector<bench_param<index_t>> getInputs()
   };
 
   const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
-    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.01f, 0.1f, 0.2f, 0.5f});
+    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});
+
+  param_vec.reserve(params_group.size());
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.sparsity}));
+  }
+  return param_vec;
+}
+
+template <typename index_t = int64_t>
+const std::vector<bench_param<index_t>> getLargeInputs()
+{
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+
+  const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
+    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
@@ -153,4 +173,6 @@ const std::vector<bench_param<index_t>> getInputs()
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int, float>), "", getInputs<int>());
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint64_t, int, double>), "", getInputs<int>());
 
+RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());
+
 }  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
index 769d5de9be..866923d647 100644
--- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
@@ -21,6 +21,7 @@
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
+#include <raft/util/device_loads_stores.cuh>
 
 #include <rmm/device_uvector.hpp>
 
@@ -41,61 +42,68 @@ namespace sparse {
 namespace convert {
 namespace detail {
 
-// Threads per block in calc_nnz_by_rows_kernel.
-static const constexpr int calc_nnz_by_rows_tpb = 32;
+// Threads per block in bitmap_to_csr.
+static const constexpr int bitmap_to_csr_tpb = 256;
 
 template <typename bitmap_t, typename index_t, typename nnz_t>
-RAFT_KERNEL __launch_bounds__(calc_nnz_by_rows_tpb) calc_nnz_by_rows_kernel(const bitmap_t* bitmap,
-                                                                            index_t num_rows,
-                                                                            index_t num_cols,
-                                                                            index_t bitmap_num,
-                                                                            nnz_t* nnz_per_row)
+RAFT_KERNEL __launch_bounds__(bitmap_to_csr_tpb) calc_nnz_by_rows_kernel(const bitmap_t* bitmap,
+                                                                         index_t num_rows,
+                                                                         index_t num_cols,
+                                                                         index_t bitmap_num,
+                                                                         nnz_t* sub_col_nnz,
+                                                                         index_t bits_per_sub_col)
 {
-  constexpr bitmap_t FULL_MASK      = ~bitmap_t(0u);
-  constexpr bitmap_t ONE            = bitmap_t(1u);
+  using mutable_bitmap_t = typename std::remove_const_t<bitmap_t>;
+  using BlockReduce      = cub::BlockReduce<index_t, bitmap_to_csr_tpb>;
+
+  __shared__ typename BlockReduce::TempStorage reduce_storage;
+
   constexpr index_t BITS_PER_BITMAP = sizeof(bitmap_t) * 8;
 
-  auto block = cg::this_thread_block();
-  auto tile  = cg::tiled_partition<32>(block);
+  const auto tid = threadIdx.x;
+  const auto row = blockIdx.x;
 
-  int lane_id = threadIdx.x & 0x1f;
+  const auto num_sub_cols = gridDim.y;
+  const auto sub_col      = blockIdx.y;
 
-  for (index_t row = blockIdx.x; row < num_rows; row += gridDim.x) {
-    index_t offset = 0;
-    index_t s_bit  = row * num_cols;
-    index_t e_bit  = s_bit + num_cols;
-    index_t l_sum  = 0;
+  size_t s_bit = size_t(row) * num_cols + sub_col * bits_per_sub_col;
+  size_t e_bit = min(s_bit + bits_per_sub_col, size_t(num_cols) * (row + 1));
 
-    int s_gap = 0;
-    int e_gap = 0;
+  nnz_t l_sum = 0;
+  nnz_t g_sum = 0;
 
-    while (offset < num_cols) {
-      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      std::remove_const_t<bitmap_t> l_bitmap = 0;
+  index_t s_offset  = s_bit % BITS_PER_BITMAP;
+  size_t bitmap_idx = s_bit / BITS_PER_BITMAP;
 
-      if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+  if (tid == 0 && s_offset != 0) {
+    mutable_bitmap_t l_bitmap = bitmap[bitmap_idx];
 
-      offset += BITS_PER_BITMAP * warpSize;
+    l_bitmap >>= s_offset;
 
-      s_gap = s_bit - bitmap_idx * BITS_PER_BITMAP;
-      if (s_gap > 0) {
-        l_bitmap >>= s_gap;
-        l_bitmap <<= s_gap;
-        offset -= s_gap;
-      }
+    size_t remaining_bits = min(size_t(BITS_PER_BITMAP - s_offset), e_bit - s_bit);
 
-      e_gap = (bitmap_idx + 1) * BITS_PER_BITMAP - e_bit;
-      if (e_gap > 0) {
-        l_bitmap <<= e_gap;
-        l_bitmap >>= e_gap;
-      }
-      l_sum += static_cast<index_t>(raft::detail::popc(l_bitmap));
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
     }
+    l_sum += static_cast<nnz_t>(raft::detail::popc(l_bitmap));
+  }
+  if (s_offset != 0) { s_bit += (BITS_PER_BITMAP - s_offset); }
 
-    l_sum = cg::reduce(tile, l_sum, cg::plus<index_t>());
+  for (size_t bit_idx = s_bit; bit_idx < e_bit; bit_idx += BITS_PER_BITMAP * blockDim.x) {
+    mutable_bitmap_t l_bitmap = 0;
+    bitmap_idx                = bit_idx / BITS_PER_BITMAP + tid;
 
-    if (lane_id == 0) { *(nnz_per_row + row) += static_cast<nnz_t>(l_sum); }
+    index_t remaining_bits = min(BITS_PER_BITMAP, index_t(e_bit - bitmap_idx * BITS_PER_BITMAP));
+
+    if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+    l_sum += static_cast<nnz_t>(raft::detail::popc(l_bitmap));
   }
+  g_sum = BlockReduce(reduce_storage).Reduce(l_sum, cub::Sum());
+  stg(g_sum, sub_col_nnz + sub_col + row * num_sub_cols, tid == 0);
 }
 
 template <typename bitmap_t, typename index_t, typename nnz_t>
@@ -103,144 +111,164 @@ void calc_nnz_by_rows(raft::resources const& handle,
                       const bitmap_t* bitmap,
                       index_t num_rows,
                       index_t num_cols,
-                      nnz_t* nnz_per_row)
+                      nnz_t* sub_col_nnz,
+                      size_t& sub_nnz_size,
+                      index_t& bits_per_sub_col)
 {
-  auto stream              = resource::get_cuda_stream(handle);
-  const index_t total      = num_rows * num_cols;
-  const index_t bitmap_num = raft::ceildiv(total, index_t(sizeof(bitmap_t) * 8));
-
-  int dev_id, sm_count, blocks_per_sm;
+  if (sub_nnz_size == 0) {
+    bits_per_sub_col = bitmap_to_csr_tpb * sizeof(index_t) * 8 * 8;
+    auto grid_dim_y  = (num_cols + bits_per_sub_col - 1) / bits_per_sub_col;
+    sub_nnz_size     = num_rows * ((num_cols + bits_per_sub_col - 1) / bits_per_sub_col);
+    return;
+  }
+  auto stream        = resource::get_cuda_stream(handle);
+  const size_t total = num_rows * num_cols;
+  const size_t bitmap_num =
+    (total + index_t(sizeof(bitmap_t) * 8) - 1) / index_t(sizeof(bitmap_t) * 8);
 
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t>, calc_nnz_by_rows_tpb, 0);
+  auto block_x = num_rows;
+  auto block_y = sub_nnz_size / num_rows;
+  dim3 grid(block_x, block_y, 1);
 
-  index_t max_active_blocks = sm_count * blocks_per_sm;
-  auto grid = std::min(max_active_blocks, raft::ceildiv(bitmap_num, index_t(calc_nnz_by_rows_tpb)));
-  auto block = calc_nnz_by_rows_tpb;
+  auto block = bitmap_to_csr_tpb;
 
-  calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t>
-    <<<grid, block, 0, stream>>>(bitmap, num_rows, num_cols, bitmap_num, nnz_per_row);
+  calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t><<<grid, block, 0, stream>>>(
+    bitmap, num_rows, num_cols, bitmap_num, sub_col_nnz, bits_per_sub_col);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-/*
-  Execute the exclusive_scan within one warp with no inter-warp communication.
-  This function calculates the exclusive prefix sum of `value` across threads within the same warp.
-  Each thread in the warp will end up with the sum of all the values of the threads with lower IDs
-  in the same warp, with the first thread always getting a sum of 0.
-*/
-template <typename value_t>
-RAFT_DEVICE_INLINE_FUNCTION value_t warp_exclusive_scan(value_t value)
-{
-  int lane_id           = threadIdx.x & 0x1f;
-  value_t shifted_value = __shfl_up_sync(0xffffffff, value, 1, warpSize);
-  if (lane_id == 0) shifted_value = 0;
-
-  value_t sum = shifted_value;
-
-  for (int i = 1; i < warpSize; i *= 2) {
-    value_t n = __shfl_up_sync(0xffffffff, sum, i, warpSize);
-    if (lane_id >= i) { sum += n; }
-  }
-  return sum;
-}
-
-// Threads per block in fill_indices_by_rows_kernel.
-static const constexpr int fill_indices_by_rows_tpb = 32;
-
 template <typename bitmap_t, typename index_t, typename nnz_t, bool check_nnz>
-RAFT_KERNEL __launch_bounds__(fill_indices_by_rows_tpb)
+RAFT_KERNEL __launch_bounds__(bitmap_to_csr_tpb)
   fill_indices_by_rows_kernel(const bitmap_t* bitmap,
-                              const index_t* indptr,
-                              index_t num_rows,
-                              index_t num_cols,
+                              index_t* indptr,
+                              size_t num_rows,
+                              size_t num_cols,
                               nnz_t nnz,
-                              index_t bitmap_num,
-                              index_t* indices)
+                              index_t* indices,
+                              nnz_t* sub_col_nnz,
+                              index_t bits_per_sub_col)
 {
-  constexpr bitmap_t FULL_MASK      = ~bitmap_t(0u);
   constexpr bitmap_t ONE            = bitmap_t(1u);
   constexpr index_t BITS_PER_BITMAP = sizeof(bitmap_t) * 8;
 
-  int lane_id = threadIdx.x & 0x1f;
+  using mutable_bitmap_t = typename std::remove_const_t<bitmap_t>;
+  using BlockScan        = cub::BlockScan<int, bitmap_to_csr_tpb>;
+
+  __shared__ typename BlockScan::TempStorage scan_storage;
+
+  const auto tid = threadIdx.x;
+  const auto row = blockIdx.x;
+
+  const auto num_sub_cols = gridDim.y;
+  const auto sub_col      = blockIdx.y;
 
   // Ensure the HBM allocated for CSR values is sufficient to handle all non-zero bitmap bits.
   // An assert will trigger if the allocated HBM is insufficient when `NDEBUG` isn't defined.
   // Note: Assertion is active only if `NDEBUG` is undefined.
   if constexpr (check_nnz) {
-    if (lane_id == 0) { assert(nnz < indptr[num_rows]); }
+    if (tid == 0) { assert(nnz < sub_col_nnz[num_rows * num_sub_cols]); }
   }
 
+  size_t s_bit = size_t(row) * num_cols + sub_col * bits_per_sub_col;
+  size_t e_bit = min(s_bit + bits_per_sub_col, size_t(num_cols) * (row + 1));
+
+  size_t l_sum = 0;
+  __shared__ size_t g_sum;
+
+  index_t s_offset  = s_bit % BITS_PER_BITMAP;
+  size_t bitmap_idx = s_bit / BITS_PER_BITMAP;
+
+  if (tid == 0 && row == 0 && sub_col == 0) { indptr[0] = 0; }
+  if (tid == 0 && sub_col == 0) { indptr[row + 1] = sub_col_nnz[(row + 1) * num_sub_cols]; }
+
+  size_t g_nnz                   = sub_col_nnz[sub_col + row * num_sub_cols];
+  index_t* sub_cols_indices_addr = indices + g_nnz;
+
+  bool guard[BITS_PER_BITMAP];
+
+  index_t g_bits = sub_col * bits_per_sub_col + tid * BITS_PER_BITMAP;
+
+  if (tid == 0 && s_offset != 0) {
+    mutable_bitmap_t l_bitmap = bitmap[bitmap_idx];
+    l_bitmap >>= s_offset;
+
+    size_t remaining_bits = min(size_t(BITS_PER_BITMAP - s_offset), e_bit - s_bit);
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+
+#pragma unroll
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      guard[i] = l_bitmap & (ONE << i);
+    }
 #pragma unroll
-  for (index_t row = blockIdx.x; row < num_rows; row += gridDim.x) {
-    index_t g_sum      = 0;
-    index_t s_bit      = row * num_cols;
-    index_t e_bit      = s_bit + num_cols;
-    index_t indptr_row = indptr[row];
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      stg(index_t(i + g_bits), sub_cols_indices_addr + l_sum, guard[i]);
+      l_sum += guard[i];
+    }
+  }
+
+  if (tid == 0) { g_sum = l_sum; }
+  __syncthreads();
+
+  if (s_offset != 0) {
+    s_bit += (BITS_PER_BITMAP - s_offset);
+    g_bits += (BITS_PER_BITMAP - s_offset);
+  }
+
+  for (size_t bit_idx = s_bit; bit_idx < e_bit; bit_idx += BITS_PER_BITMAP * blockDim.x) {
+    mutable_bitmap_t l_bitmap = 0;
+    bitmap_idx                = bit_idx / BITS_PER_BITMAP + tid;
+
+    if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+
+    index_t remaining_bits = min(BITS_PER_BITMAP, index_t(e_bit - bitmap_idx * BITS_PER_BITMAP));
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+
+    int l_bits    = raft::detail::popc(l_bitmap);
+    int l_sum_32b = 0;
+    BlockScan(scan_storage).InclusiveSum(l_bits, l_sum_32b);
+    l_sum = l_sum_32b + g_sum - l_bits;
+    __syncthreads();
 
 #pragma unroll
-    for (index_t offset = 0; offset < num_cols; offset += BITS_PER_BITMAP * warpSize) {
-      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      std::remove_const_t<bitmap_t> l_bitmap = 0;
-      index_t l_offset = offset + lane_id * BITS_PER_BITMAP - (s_bit % BITS_PER_BITMAP);
-
-      if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
-
-      if (s_bit > bitmap_idx * BITS_PER_BITMAP) {
-        l_bitmap >>= (s_bit - bitmap_idx * BITS_PER_BITMAP);
-        l_bitmap <<= (s_bit - bitmap_idx * BITS_PER_BITMAP);
-      }
-
-      if ((bitmap_idx + 1) * BITS_PER_BITMAP > e_bit) {
-        l_bitmap <<= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit);
-        l_bitmap >>= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit);
-      }
-
-      index_t l_sum =
-        g_sum + warp_exclusive_scan(static_cast<index_t>(raft::detail::popc(l_bitmap)));
-
-      for (int i = 0; i < BITS_PER_BITMAP; i++) {
-        if (l_bitmap & (ONE << i)) {
-          indices[indptr_row + l_sum] = l_offset + i;
-          l_sum++;
-        }
-      }
-      g_sum = __shfl_sync(0xffffffff, l_sum, warpSize - 1);
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      guard[i] = l_bitmap & (ONE << i);
     }
+#pragma unroll
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      stg(index_t(i + g_bits), sub_cols_indices_addr + l_sum, guard[i]);
+      l_sum += guard[i];
+    }
+
+    if (threadIdx.x == (bitmap_to_csr_tpb - 1)) { g_sum += (l_sum_32b); }
+    g_bits += BITS_PER_BITMAP * blockDim.x;
   }
 }
 
 template <typename bitmap_t, typename index_t, typename nnz_t, bool check_nnz = false>
 void fill_indices_by_rows(raft::resources const& handle,
                           const bitmap_t* bitmap,
-                          const index_t* indptr,
+                          index_t* indptr,
                           index_t num_rows,
                           index_t num_cols,
                           nnz_t nnz,
-                          index_t* indices)
+                          index_t* indices,
+                          nnz_t* sub_col_nnz,
+                          index_t bits_per_sub_col,
+                          size_t sub_nnz_size)
 {
-  auto stream              = resource::get_cuda_stream(handle);
-  const index_t total      = num_rows * num_cols;
-  const index_t bitmap_num = raft::ceildiv(total, index_t(sizeof(bitmap_t) * 8));
-
-  int dev_id, sm_count, blocks_per_sm;
-
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm,
-    fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz>,
-    fill_indices_by_rows_tpb,
-    0);
-
-  index_t max_active_blocks = sm_count * blocks_per_sm;
-  auto grid                 = std::min(max_active_blocks, num_rows);
-  auto block                = fill_indices_by_rows_tpb;
-
-  fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz>
-    <<<grid, block, 0, stream>>>(bitmap, indptr, num_rows, num_cols, nnz, bitmap_num, indices);
+  auto stream  = resource::get_cuda_stream(handle);
+  auto block_x = num_rows;
+  auto block_y = sub_nnz_size / num_rows;
+  dim3 grid(block_x, block_y, 1);
+
+  auto block = bitmap_to_csr_tpb;
+
+  fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz><<<grid, block, 0, stream>>>(
+    bitmap, indptr, num_rows, num_cols, nnz, indices, sub_col_nnz, bits_per_sub_col);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -252,6 +280,7 @@ void bitmap_to_csr(raft::resources const& handle,
                    raft::core::bitmap_view<bitmap_t, index_t> bitmap,
                    csr_matrix_t& csr)
 {
+  using nnz_t   = typename csr_matrix_t::nnz_type;
   auto csr_view = csr.structure_view();
 
   if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0 || csr_view.get_nnz() == 0) {
@@ -274,25 +303,50 @@ void bitmap_to_csr(raft::resources const& handle,
 
   RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
 
-  calc_nnz_by_rows(handle, bitmap.data(), csr_view.get_n_rows(), csr_view.get_n_cols(), indptr);
-  thrust::exclusive_scan(thrust_policy, indptr, indptr + csr_view.get_n_rows() + 1, indptr);
+  size_t sub_nnz_size      = 0;
+  index_t bits_per_sub_col = 0;
+
+  // Get buffer size and number of bits per each sub-columns
+  calc_nnz_by_rows(handle,
+                   bitmap.data(),
+                   csr_view.get_n_rows(),
+                   csr_view.get_n_cols(),
+                   static_cast<nnz_t*>(nullptr),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
+  rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
+
+  calc_nnz_by_rows(handle,
+                   bitmap.data(),
+                   csr_view.get_n_rows(),
+                   csr_view.get_n_cols(),
+                   sub_nnz.data(),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  thrust::exclusive_scan(
+    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
 
   if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
     index_t nnz = 0;
     RAFT_CUDA_TRY(cudaMemcpyAsync(
-      &nnz, indptr + csr_view.get_n_rows(), sizeof(index_t), cudaMemcpyDeviceToHost, stream));
+      &nnz, sub_nnz.data() + sub_nnz_size, sizeof(index_t), cudaMemcpyDeviceToHost, stream));
     resource::sync_stream(handle);
     csr.initialize_sparsity(nnz);
   }
   constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
-  fill_indices_by_rows<bitmap_t, index_t, typename csr_matrix_t::nnz_type, check_nnz>(
-    handle,
-    bitmap.data(),
-    indptr,
-    csr_view.get_n_rows(),
-    csr_view.get_n_cols(),
-    csr_view.get_nnz(),
-    indices);
+  fill_indices_by_rows<bitmap_t, index_t, nnz_t, check_nnz>(handle,
+                                                            bitmap.data(),
+                                                            indptr,
+                                                            csr_view.get_n_rows(),
+                                                            csr_view.get_n_cols(),
+                                                            csr_view.get_nnz(),
+                                                            indices,
+                                                            sub_nnz.data(),
+                                                            bits_per_sub_col,
+                                                            sub_nnz_size);
 
   thrust::fill_n(thrust_policy,
                  csr.get_elements().data(),
diff --git a/cpp/include/raft/util/device_loads_stores.cuh b/cpp/include/raft/util/device_loads_stores.cuh
index 2c954ec99a..c1b668fed6 100644
--- a/cpp/include/raft/util/device_loads_stores.cuh
+++ b/cpp/include/raft/util/device_loads_stores.cuh
@@ -739,4 +739,46 @@ DI void block_copy(raft::device_span<T> dst, const raft::device_span<T> src)
 
 /** @} */
 
+/**
+ * @defgroup GlobalStores Global Store Operations
+ * @{
+ * @brief Perform conditional stores to global memory.
+ *
+ * These functions store data to a specified global memory address,
+ * controlled by a guard flag to enable conditional execution.
+ *
+ * @param[in] reg   The data to store in global memory.
+ *                  The type of `reg` determines the size of the store.
+ * @param[in] addr  The global memory address where the data will be stored.
+ * @param[in] guard A flag to conditionally enable the store operation.
+ *                  If `true`, the store is performed; otherwise, it is skipped
+ */
+DI void stg(const int& reg, void* addr, bool guard)
+{
+  asm volatile(
+    "{\n"
+    ".reg .pred p;\n"
+    "setp.ne.b32 p, %2, 0;\n"
+    "@p st.global.b32 [%0], %1;\n"
+    "}\n"
+    :
+    : "l"(addr), "r"(reg), "r"((int)guard)
+    : "memory");
+}
+
+DI void stg(const int64_t& reg, void* addr, bool guard)
+{
+  asm volatile(
+    "{\n"
+    ".reg .pred p;\n"
+    "setp.ne.b32 p, %2, 0;\n"
+    "@p st.global.b64 [%0], %1;\n"
+    "}\n"
+    :
+    : "l"(addr), "l"(reg), "r"((int)guard)
+    : "memory");
+}
+
+/** @} */
+
 }  // namespace raft
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 1cd49b0bbd..c1a495ea3d 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -249,7 +249,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
   index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
     for (auto& item : bitmap) {
@@ -257,7 +257,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
     }
 
     std::random_device rd;
-    std::mt19937 gen(rd());
+    std::mt19937 gen(random_number = rd());
     std::uniform_int_distribution<index_t> dis(0, total - 1);
 
     while (num_ones > 0) {
@@ -318,8 +318,8 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
       size_t start_idx = row_ptrs1[i];
       size_t end_idx   = row_ptrs1[i + 1];
 
-      std::vector<int> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
-      std::vector<int> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
+      std::vector<index_t> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
+      std::vector<index_t> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
 
       std::sort(cols1.begin(), cols1.end());
       std::sort(cols2.begin(), cols2.end());
@@ -396,9 +396,13 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
 
     resource::sync_stream(handle);
 
-    ASSERT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h));
-    ASSERT_TRUE(raft::devArrMatch<value_t>(
-      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream));
+    EXPECT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h))
+      << " n_row: " << params.n_rows << ", n_cols: " << params.n_cols << ", nnz: " << nnz
+      << ", random_number: " << random_number;
+    EXPECT_TRUE(raft::devArrMatch<value_t>(
+      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream))
+      << " n_row: " << params.n_rows << ", n_cols: " << params.n_cols << ", nnz: " << nnz
+      << ", random_number: " << random_number;
   }
 
  protected:
@@ -418,6 +422,8 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
   rmm::device_uvector<index_t> indptr_expected_d;
   rmm::device_uvector<index_t> indices_expected_d;
   rmm::device_uvector<float> values_expected_d;
+
+  unsigned int random_number;
 };
 
 using BitmapToCSRTestI = BitmapToCSRTest<uint32_t, int, float>;
@@ -426,40 +432,50 @@ TEST_P(BitmapToCSRTestI, Result) { Run(); }
 using BitmapToCSRTestL = BitmapToCSRTest<uint32_t, int64_t, float>;
 TEST_P(BitmapToCSRTestL, Result) { Run(); }
 
+using BitmapToCSRTestLOnLargeSize = BitmapToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitmapToCSRTestLOnLargeSize, Result) { Run(); }
+
 template <typename index_t>
 const std::vector<BitmapToCSRInputs<index_t>> bitmaptocsr_inputs = {
-  {0, 0, 0.2, false},
-  {10, 32, 0.4, false},
-  {10, 3, 0.2, false},
-  {32, 1024, 0.4, false},
-  {1024, 1048576, 0.01, false},
-  {1024, 1024, 0.4, false},
-  {64 * 1024 + 10, 2, 0.3, false},  // 64K + 10 is slightly over maximum of blockDim.y
-  {16, 16, 0.3, false},             // No peeling-remainder
-  {17, 16, 0.3, false},             // Check peeling-remainder
-  {18, 16, 0.3, false},             // Check peeling-remainder
-  {32 + 9, 33, 0.2, false},         // Check peeling-remainder
-  {2, 33, 0.2, false},              // Check peeling-remainder
-  {0, 0, 0.2, true},
-  {10, 32, 0.4, true},
-  {10, 3, 0.2, true},
-  {32, 1024, 0.4, true},
-  {1024, 1048576, 0.01, true},
-  {1024, 1024, 0.4, true},
-  {64 * 1024 + 10, 2, 0.3, true},  // 64K + 10 is slightly over maximum of blockDim.y
-  {16, 16, 0.3, true},             // No peeling-remainder
-  {17, 16, 0.3, true},             // Check peeling-remainder
-  {18, 16, 0.3, true},             // Check peeling-remainder
-  {32 + 9, 33, 0.2, true},         // Check peeling-remainder
-  {2, 33, 0.2, true},              // Check peeling-remainder
+  {0, 0, 0.8, false},
+  {10, 32, 0.6, false},
+  {10, 3, 0.8, false},
+  {32, 1024, 0.6, false},
+  {1024, 1048576, 0.99, false},
+  {1024, 1024, 0.6, false},
+  {64 * 1024 + 10, 2, 0.7, false},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.7, false},             // No peeling-remainder
+  {17, 16, 0.7, false},             // Check peeling-remainder
+  {18, 16, 0.7, false},             // Check peeling-remainder
+  {32 + 9, 33, 0.8, false},         // Check peeling-remainder
+  {2, 33, 0.8, false},              // Check peeling-remainder
+  {0, 0, 0.8, true},
+  {10, 32, 0.6, true},
+  {10, 3, 0.8, true},
+  {32, 1024, 0.6, true},
+  {1024, 1048576, 0.99, true},
+  {1024, 1024, 0.6, true},
+  {64 * 1024 + 10, 2, 0.7, true},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.7, true},             // No peeling-remainder
+  {17, 16, 0.7, true},             // Check peeling-remainder
+  {18, 16, 0.7, true},             // Check peeling-remainder
+  {32 + 9, 33, 0.8, true},         // Check peeling-remainder
+  {2, 33, 0.8, true},              // Check peeling-remainder
 };
 
+template <typename index_t>
+const std::vector<BitmapToCSRInputs<index_t>> bitmaptocsr_large_inputs = {
+  {100, 100000000, 0.99, true}, {100, 100000000, 0.95, false}, {100, 100000000 + 17, 0.95, false}};
+
 INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
                         BitmapToCSRTestI,
                         ::testing::ValuesIn(bitmaptocsr_inputs<int>));
 INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
                         BitmapToCSRTestL,
                         ::testing::ValuesIn(bitmaptocsr_inputs<int64_t>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitmapToCSRTestLOnLargeSize,
+                        ::testing::ValuesIn(bitmaptocsr_large_inputs<int64_t>));
 
 }  // namespace sparse
 }  // namespace raft

From bfd190687ee396374b7106d9ac26add73b57b22a Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 17 Dec 2024 12:08:26 -0600
Subject: [PATCH 11/37] reduce duplication, remove unused things in
 dependencies.yaml (#2529)

Proposes some small cleanup for `dependencies.yaml`

* removes `rapids_build_setuptools` dependency group
  - *#2497 removed the last use of `setuptools` here*
* breaks `cuda-python` and `rmm` out into `depends_on_*` groups to reduce duplication, and for consistency with other RAPIDS projects ([docs explaining this](https://github.com/rapidsai/build-planning/blob/d9e3c606d95c835ee384ac6480a4af0ac6cb024a/docs/docs/packaging.md#L181))
* alphabetizes lists

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2529
---
 dependencies.yaml | 156 ++++++++++++++++++----------------------------
 1 file changed, 60 insertions(+), 96 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 37ea223a01..dc1807fbf9 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,22 +6,22 @@ files:
       cuda: ["11.8", "12.5"]
       arch: [x86_64, aarch64]
     includes:
-      - rapids_build
-      - build_pylibraft
+      - checks
       - cuda
       - cuda_version
+      - depends_on_cuda_python
       - depends_on_cupy
       - depends_on_distributed_ucxx
+      - depends_on_rmm
       - develop
-      - checks
-      - test_libraft
       - docs
-      - rapids_build_setuptools
+      - rapids_build
       - rapids_build_skbuild
-      - run_raft_dask
       - run_pylibraft
-      - test_python_common
+      - run_raft_dask
+      - test_libraft
       - test_pylibraft
+      - test_python_common
   test_cpp:
     output: none
     includes:
@@ -31,10 +31,10 @@ files:
     output: none
     includes:
       - cuda_version
+      - depends_on_cupy
       - py_version
-      - test_python_common
       - test_pylibraft
-      - depends_on_cupy
+      - test_python_common
   checks:
     output: none
     includes:
@@ -62,8 +62,9 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
+      - depends_on_cuda_python
+      - depends_on_rmm
       - rapids_build
-      - build_pylibraft
   py_run_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -71,6 +72,8 @@ files:
       table: project
     includes:
       - cuda_wheels
+      - depends_on_cuda_python
+      - depends_on_rmm
       - run_pylibraft
   py_test_pylibraft:
     output: pyproject
@@ -79,9 +82,9 @@ files:
       table: project.optional-dependencies
       key: test
     includes:
-      - test_python_common
-      - test_pylibraft
       - depends_on_cupy
+      - test_pylibraft
+      - test_python_common
   py_build_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -96,16 +99,16 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
-      - rapids_build
       - depends_on_ucx_build
+      - rapids_build
   py_run_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
     extras:
       table: project
     includes:
-      - run_raft_dask
       - depends_on_distributed_ucxx
+      - run_raft_dask
   py_test_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -125,7 +128,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0
+          - rapids-build-backend>=0.3.0,<0.4.0.dev0
       - output_types: [conda]
         packages:
           - scikit-build-core>=0.10.0
@@ -180,44 +183,6 @@ dependencies:
           - matrix: {cuda: "11.2", arch: aarch64}
             packages: [nvcc_linux-aarch64=11.2]
 
-  build_pylibraft:
-    common:
-      - output_types: [conda]
-        packages:
-          - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [conda, requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-            packages:
-              - &cuda_python12 cuda-python>=12.6.2,<13.0a0
-          - matrix:
-              cuda: "11.*"
-            packages:
-              - &cuda_python11 cuda-python>=11.8.5,<12.0a0
-          - matrix:
-            packages:
-              - &cuda_python cuda-python
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu12 rmm-cu12==25.2.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu11 rmm-cu11==25.2.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_unsuffixed] }
   checks:
     common:
       - output_types: [conda, requirements]
@@ -398,13 +363,6 @@ dependencies:
           - recommonmark
           - sphinx-copybutton
           - sphinx-markdown-tables
-  rapids_build_setuptools:
-    common:
-      - output_types: [requirements, pyproject]
-        packages:
-          - wheel
-          - setuptools
-          - *rapids_build_backend
   py_version:
     specific:
       - output_types: conda
@@ -429,42 +387,6 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - numpy>=1.23,<3.0a0
-      - output_types: [conda]
-        packages:
-          - *rmm_unsuffixed
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for cudf and rmm.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [conda, requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-            packages:
-              - *cuda_python12
-          - matrix:
-              cuda: "11.*"
-            packages:
-              - *cuda_python11
-          - matrix:
-            packages:
-              - *cuda_python
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - *rmm_cu12
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - *rmm_cu11
-          - {matrix: null, packages: [*rmm_unsuffixed]}
   run_raft_dask:
     common:
       - output_types: [conda, pyproject]
@@ -511,6 +433,21 @@ dependencies:
         packages:
           - scikit-learn
           - scipy
+  depends_on_cuda_python:
+    specific:
+      - output_types: [conda, requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - cuda-python>=12.6.2,<13.0a0
+          - matrix:
+              cuda: "11.*"
+            packages:
+              - cuda-python>=11.8.5,<12.0a0
+          - matrix:
+            packages:
+              - cuda-python
   depends_on_distributed_ucxx:
     common:
       - output_types: conda
@@ -537,6 +474,33 @@ dependencies:
             packages:
               - distributed-ucxx-cu11==0.42.*,>=0.0.0a0
           - {matrix: null, packages: [*distributed_ucxx_unsuffixed]}
+  depends_on_rmm:
+    common:
+      - output_types: conda
+        packages:
+          - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *rmm_unsuffixed
   depends_on_ucx_build:
     common:
       - output_types: conda

From d7e68f55c58493bc9cf1fbe4eb775a9593891c58 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Wed, 18 Dec 2024 14:30:22 +0900
Subject: [PATCH 12/37] [DOC] Fix sample code (#2518)

`raft::raft::resources` -> `raft::resources`

Authors:
  - tsuki (https://github.com/enp1s0)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2518
---
 cpp/include/raft/cluster/kmeans.cuh                 | 8 ++++----
 cpp/include/raft/comms/std_comms.hpp                | 4 ++--
 cpp/include/raft/distance/distance-inl.cuh          | 2 +-
 cpp/include/raft/neighbors/epsilon_neighborhood.cuh | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index 38318e8ec8..ee1fc83a9b 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -52,7 +52,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -61,7 +61,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  *               params,
  *               X,
  *               std::nullopt,
- *               centroids,
+ *               centroids.view(),
  *               raft::make_scalar_view(&inertia),
  *               raft::make_scalar_view(&n_iter));
  * @endcode
@@ -107,7 +107,7 @@ template <typename DataT, typename IndexT>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -175,7 +175,7 @@ template <typename DataT, typename IndexT>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 667c8be285..8481360897 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -52,7 +52,7 @@ using std_comms = detail::std_comms;
  * #include <raft/core/device_mdarray.hpp>
  *
  * ncclComm_t nccl_comm;
- * raft::raft::resources handle;
+ * raft::resources handle;
  *
  * build_comms_nccl_only(&handle, nccl_comm, 5, 0);
  * ...
@@ -98,7 +98,7 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank
  * #include <raft/core/device_mdarray.hpp>
  *
  * ncclComm_t nccl_comm;
- * raft::raft::resources handle;
+ * raft::resources handle;
  * ucp_worker_h ucp_worker;
  * ucp_ep_h *ucp_endpoints_arr;
  *
diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh
index 13c9d57efd..d5f8d1cfe1 100644
--- a/cpp/include/raft/distance/distance-inl.cuh
+++ b/cpp/include/raft/distance/distance-inl.cuh
@@ -366,7 +366,7 @@ void pairwise_distance(raft::resources const& handle,
  * #include <raft/random/make_blobs.cuh>
  * #include <raft/distance/distance.cuh>
  *
- * raft::raft::resources handle;
+ * raft::resources handle;
  * int n_samples = 5000;
  * int n_features = 50;
  *
diff --git a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
index bade4385fb..c2f531263d 100644
--- a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
@@ -76,7 +76,7 @@ void epsUnexpL2SqNeighborhood(bool* adj,
  *  #include <raft/core/resources.hpp>
  *  #include <raft/core/device_mdarray.hpp>
  *  using namespace raft::neighbors;
- *  raft::raft::resources handle;
+ *  raft::resources handle;
  *  ...
  *  auto adj = raft::make_device_matrix<bool>(handle, m * n);
  *  auto vd = raft::make_device_vector<int>(handle, m+1);
@@ -120,4 +120,4 @@ void eps_neighbors_l2sq(raft::resources const& handle,
 
 }  // namespace raft::neighbors::epsilon_neighborhood
 
-#endif
\ No newline at end of file
+#endif

From dee71f89e4bdf5ce3ff982e53da9c6c81d883608 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 30 Dec 2024 11:44:33 -0800
Subject: [PATCH 13/37] Check if nightlies have succeeded recently enough
 (#2533)

Contributes to https://github.com/rapidsai/build-planning/issues/127

This PR cannot be merged unless nightly CI has passed within the past 7 days, so if it remains unmerged that will itself be an indication that nightly CI needs fixing.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/raft/pull/2533
---
 .github/workflows/pr.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 965943e726..a270df1dfa 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -30,6 +31,18 @@ jobs:
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: raft
   changed-files:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02

From eef9a4fa9a39d4349ed699b097a3e3ff6c78cbc4 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 30 Dec 2024 11:48:07 -0800
Subject: [PATCH 14/37] Switch over to rapids-logger (#2530)

This PR removes raft's implementation of a logger in favor of the centralized one in [rapids-logger](https://github.com/rapidsai/rapids-logger). Consumers still get the benefits of a PImpl idiom, but now that is primarily handled by using the appropriate targets (if necessary the impl header is of course still available for direct inclusion). This change paves the way for ensuring consistent fmt/spdlog (lack of) linkage throughout RAPIDS conda and wheel packages.

This PR requires https://github.com/rapidsai/rapids-logger/pull/1

Contributes to https://github.com/rapidsai/build-planning/issues/104

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2530
---
 cpp/CMakeLists.txt                            |  28 +++-
 cpp/include/raft/cluster/detail/kmeans.cuh    |   8 +-
 .../raft/cluster/detail/kmeans_balanced.cuh   |   1 +
 cpp/include/raft/cluster/kmeans_types.hpp     |   2 +-
 cpp/include/raft/common/logger.hpp            |  24 ---
 cpp/include/raft/core/cublas_macros.hpp       |   3 -
 cpp/include/raft/core/cusolver_macros.hpp     |   7 +-
 cpp/include/raft/core/cusparse_macros.hpp     |   2 -
 .../raft/core/detail/callback_sink.hpp        |  71 --------
 .../core/detail/fail_container_policy.hpp     |   2 +-
 cpp/include/raft/core/detail/logger.hpp       |  24 ---
 cpp/include/raft/core/logger-ext.hpp          | 152 -----------------
 cpp/include/raft/core/logger-inl.hpp          | 153 ------------------
 cpp/include/raft/core/logger-macros.hpp       |  95 ++---------
 cpp/include/raft/core/logger.hpp              |  23 ---
 .../raft/neighbors/detail/ivf_flat_build.cuh  |   1 +
 .../neighbors/detail/ivf_flat_search-inl.cuh  |   3 +-
 .../raft/solver/detail/lap_kernels.cuh        |   3 +-
 .../raft/sparse/solver/detail/lanczos.cuh     |   2 +-
 cpp/src/core/logger.cpp                       |  16 --
 cpp/test/CMakeLists.txt                       |   5 +
 cpp/test/core/device_resources_manager.cpp    |   2 +-
 cpp/test/core/logger.cpp                      |  57 +++----
 docs/source/developer_guide.md                |   4 +-
 24 files changed, 87 insertions(+), 601 deletions(-)
 delete mode 100644 cpp/include/raft/common/logger.hpp
 delete mode 100644 cpp/include/raft/core/detail/callback_sink.hpp
 delete mode 100644 cpp/include/raft/core/detail/logger.hpp
 delete mode 100644 cpp/include/raft/core/logger-ext.hpp
 delete mode 100644 cpp/include/raft/core/logger-inl.hpp
 delete mode 100644 cpp/include/raft/core/logger.hpp
 delete mode 100644 cpp/src/core/logger.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 78a4dbb913..06531941aa 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -100,6 +100,17 @@ set_property(
 )
 message(VERBOSE "RAFT: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.")
 
+# Set logging level
+set(LIBRAFT_LOGGING_LEVEL
+    "INFO"
+    CACHE STRING "Choose the logging level."
+)
+set_property(
+  CACHE LIBRAFT_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL"
+                                       "OFF"
+)
+message(VERBOSE "RAFT: LIBRAFT_LOGGING_LEVEL = '${LIBRAFT_LOGGING_LEVEL}'.")
+
 # ##################################################################################################
 # * Conda environment detection ----------------------------------------------
 
@@ -152,6 +163,13 @@ include(cmake/modules/ConfigureCUDA.cmake)
 # add third party dependencies using CPM
 rapids_cpm_init()
 
+# Not using rapids-cmake since we never want to find, always download.
+CPMAddPackage(
+  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG
+  4df3ee70c6746fd1b6c0dc14209dae2e2d4378c6 VERSION 4df3ee70c6746fd1b6c0dc14209dae2e2d4378c6
+)
+rapids_make_logger(raft LOGGER_HEADER_DIR include/raft/core EXPORT_SET raft-exports)
+
 # CCCL before rmm/cuco so we get the right version of CCCL
 include(cmake/thirdparty/get_cccl.cmake)
 include(cmake/thirdparty/get_rmm.cmake)
@@ -182,7 +200,7 @@ target_include_directories(
 # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target.
 target_link_libraries(
   raft INTERFACE rmm::rmm rmm::rmm_logger spdlog::spdlog_header_only cuco::cuco
-                 nvidia::cutlass::cutlass CCCL::CCCL
+                 nvidia::cutlass::cutlass CCCL::CCCL raft_logger
 )
 
 target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
@@ -190,6 +208,9 @@ target_compile_options(
   raft INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
                  --expt-relaxed-constexpr>
 )
+target_compile_definitions(
+  raft INTERFACE "RAFT_LOG_ACTIVE_LEVEL=RAFT_LOG_LEVEL_${LIBRAFT_LOGGING_LEVEL}"
+)
 
 set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix})
 set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix})
@@ -265,7 +286,6 @@ set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled)
 if(RAFT_COMPILE_LIBRARY)
   add_library(
     raft_objs OBJECT
-    src/core/logger.cpp
     src/linalg/detail/coalesced_reduction.cu
     src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu
     src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
@@ -318,8 +338,8 @@ if(RAFT_COMPILE_LIBRARY)
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
   endforeach()
-  target_link_libraries(raft_lib PRIVATE rmm::rmm_logger_impl)
-  target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl)
+  target_link_libraries(raft_lib PRIVATE rmm::rmm_logger_impl raft_logger_impl)
+  target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl raft_logger_impl)
 endif()
 
 if(TARGET raft_lib AND (NOT TARGET raft::raft_lib))
diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh
index 4efeedcbaa..4203f0969b 100644
--- a/cpp/include/raft/cluster/detail/kmeans.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans.cuh
@@ -369,7 +369,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      rmm::device_uvector<char>& workspace)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -865,7 +865,7 @@ void kmeans_fit(raft::resources const& handle,
       params.n_clusters);
   }
 
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
 
   // Allocate memory
   rmm::device_uvector<char> workspace(0, stream);
@@ -1010,7 +1010,7 @@ void kmeans_predict(raft::resources const& handle,
   RAFT_EXPECTS(centroids.extent(1) == n_features,
                "invalid parameter (centroids.extent(1) != n_features)");
 
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   auto metric = params.metric;
 
   // Allocate memory
@@ -1201,7 +1201,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<DataT> X_new)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_transform");
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
index 0a5a3ba5aa..5dcd679bd5 100644
--- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -20,6 +20,7 @@
 #include <raft/cluster/kmeans_balanced_types.hpp>
 #include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp
index 4d956ad7a0..fbedd58417 100644
--- a/cpp/include/raft/cluster/kmeans_types.hpp
+++ b/cpp/include/raft/cluster/kmeans_types.hpp
@@ -82,7 +82,7 @@ struct KMeansParams : kmeans_base_params {
   /**
    * verbosity level.
    */
-  int verbosity = RAFT_LEVEL_INFO;
+  level_enum verbosity = level_enum::info;
 
   /**
    * Seed to the random number generator.
diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp
deleted file mode 100644
index 77483e577d..0000000000
--- a/cpp/include/raft/common/logger.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * This file is deprecated and will be removed in release 22.08.
- * Please use the include/core/logger.hpp instead.
- */
-
-#pragma once
-
-#include <raft/core/logger.hpp>
\ No newline at end of file
diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp
index b69b121161..6c195d8a6f 100644
--- a/cpp/include/raft/core/cublas_macros.hpp
+++ b/cpp/include/raft/core/cublas_macros.hpp
@@ -23,9 +23,6 @@
 
 #include <cublas_v2.h>
 
-///@todo: enable this once we have logger enabled
-// #include <cuml/common/logger.hpp>
-
 #include <cstdint>
 
 #define _CUBLAS_ERR_TO_STR(err) \
diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp
index 74a8b7c36c..beaf2d74dc 100644
--- a/cpp/include/raft/core/cusolver_macros.hpp
+++ b/cpp/include/raft/core/cusolver_macros.hpp
@@ -19,11 +19,10 @@
 
 #pragma once
 
+#include <raft/util/cudart_utils.hpp>
+
 #include <cusolverDn.h>
 #include <cusolverSp.h>
-///@todo: enable this once logging is enabled
-// #include <cuml/common/logger.hpp>
-#include <raft/util/cudart_utils.hpp>
 
 #include <type_traits>
 
@@ -135,4 +134,4 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
 #define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call)
 #endif
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp
index 5a1968b529..2a1df14345 100644
--- a/cpp/include/raft/core/cusparse_macros.hpp
+++ b/cpp/include/raft/core/cusparse_macros.hpp
@@ -19,8 +19,6 @@
 #include <raft/core/error.hpp>
 
 #include <cusparse.h>
-///@todo: enable this once logging is enabled
-// #include <cuml/common/logger.hpp>
 
 #define _CUSPARSE_ERR_TO_STR(err) \
   case err: return #err;
diff --git a/cpp/include/raft/core/detail/callback_sink.hpp b/cpp/include/raft/core/detail/callback_sink.hpp
deleted file mode 100644
index a110af5c76..0000000000
--- a/cpp/include/raft/core/detail/callback_sink.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <iostream>
-#include <mutex>
-
-#define SPDLOG_HEADER_ONLY
-#include <spdlog/common.h>
-#include <spdlog/details/log_msg.h>
-#include <spdlog/sinks/base_sink.h>
-
-namespace spdlog::sinks {
-
-typedef void (*LogCallback)(int lvl, const char* msg);
-
-template <class Mutex>
-class CallbackSink : public base_sink<Mutex> {
- public:
-  explicit CallbackSink(std::string tag      = "spdlog",
-                        LogCallback callback = nullptr,
-                        void (*flush)()      = nullptr)
-    : _callback{callback}, _flush{flush} {};
-
-  void set_callback(LogCallback callback) { _callback = callback; }
-  void set_flush(void (*flush)()) { _flush = flush; }
-
- protected:
-  void sink_it_(const details::log_msg& msg) override
-  {
-    spdlog::memory_buf_t formatted;
-    base_sink<Mutex>::formatter_->format(msg, formatted);
-    std::string msg_string = fmt::to_string(formatted);
-
-    if (_callback) {
-      _callback(static_cast<int>(msg.level), msg_string.c_str());
-    } else {
-      std::cout << msg_string;
-    }
-  }
-
-  void flush_() override
-  {
-    if (_flush) {
-      _flush();
-    } else {
-      std::cout << std::flush;
-    }
-  }
-
-  LogCallback _callback;
-  void (*_flush)();
-};
-
-using callback_sink_mt = CallbackSink<std::mutex>;
-using callback_sink_st = CallbackSink<details::null_mutex>;
-
-}  // end namespace spdlog::sinks
diff --git a/cpp/include/raft/core/detail/fail_container_policy.hpp b/cpp/include/raft/core/detail/fail_container_policy.hpp
index cf9d0887dd..f5f1bfb377 100644
--- a/cpp/include/raft/core/detail/fail_container_policy.hpp
+++ b/cpp/include/raft/core/detail/fail_container_policy.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 #include <raft/core/error.hpp>
-#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/thirdparty/mdspan/include/experimental/mdspan>
 
diff --git a/cpp/include/raft/core/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp
deleted file mode 100644
index f3f52b46ae..0000000000
--- a/cpp/include/raft/core/detail/logger.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
-#pragma message(__FILE__                                                   \
-                  " is deprecated and will be removed in future releases." \
-                  " Please use the <raft/core/logger.hpp> version instead.")
-#endif
-
-#include <raft/core/logger.hpp>
diff --git a/cpp/include/raft/core/logger-ext.hpp b/cpp/include/raft/core/logger-ext.hpp
deleted file mode 100644
index 73fe463aba..0000000000
--- a/cpp/include/raft/core/logger-ext.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/core/detail/macros.hpp>  // RAFT_INLINE_CONDITIONAL
-
-#include <memory>         // std::unique_ptr
-#include <string>         // std::string
-#include <unordered_map>  // std::unordered_map
-
-namespace raft {
-
-static const std::string RAFT_NAME = "raft";
-static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v");
-
-namespace detail {
-RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...);
-}
-/**
- * @brief The main Logging class for raft library.
- *
- * This class acts as a thin wrapper over the underlying `spdlog` interface. The
- * design is done in this way in order to avoid us having to also ship `spdlog`
- * header files in our installation.
- *
- * @todo This currently only supports logging to stdout. Need to add support in
- *       future to add custom loggers as well [Issue #2046]
- */
-class logger {
- public:
-  // @todo setting the logger once per process with
-  logger(std::string const& name_ = "");
-  /**
-   * @brief Singleton method to get the underlying logger object
-   *
-   * @return the singleton logger object
-   */
-  static logger& get(std::string const& name = "");
-
-  /**
-   * @brief Set the logging level.
-   *
-   * Only messages with level equal or above this will be printed
-   *
-   * @param[in] level logging level
-   *
-   * @note The log level will actually be set only if the input is within the
-   *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
-   *       be ignored. See documentation of decisiontree for how this gets used
-   */
-  void set_level(int level);
-
-  /**
-   * @brief Set the logging pattern
-   *
-   * @param[in] pattern the pattern to be set. Refer this link
-   *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
-   *                    to know the right syntax of this pattern
-   */
-  void set_pattern(const std::string& pattern);
-
-  /**
-   * @brief Register a callback function to be run in place of usual log call
-   *
-   * @param[in] callback the function to be run on all logged messages
-   */
-  void set_callback(void (*callback)(int lvl, const char* msg));
-
-  /**
-   * @brief Register a flush function compatible with the registered callback
-   *
-   * @param[in] flush the function to use when flushing logs
-   */
-  void set_flush(void (*flush)());
-
-  /**
-   * @brief Tells whether messages will be logged for the given log level
-   *
-   * @param[in] level log level to be checked for
-   * @return true if messages will be logged for this level, else false
-   */
-  bool should_log_for(int level) const;
-  /**
-   * @brief Query for the current log level
-   *
-   * @return the current log level
-   */
-  int get_level() const;
-
-  /**
-   * @brief Get the current logging pattern
-   * @return the pattern
-   */
-  std::string get_pattern() const;
-
-  /**
-   * @brief Main logging method
-   *
-   * @param[in] level logging level of this message
-   * @param[in] fmt   C-like format string, followed by respective params
-   */
-  void log(int level, const char* fmt, ...);
-
-  /**
-   * @brief Flush logs by calling flush on underlying logger
-   */
-  void flush();
-
-  ~logger();
-
- private:
-  logger();
-  // pimpl pattern:
-  // https://learn.microsoft.com/en-us/cpp/cpp/pimpl-for-compile-time-encapsulation-modern-cpp?view=msvc-170
-  class impl;
-  std::unique_ptr<impl> pimpl;
-  static inline std::unordered_map<std::string, std::shared_ptr<raft::logger>> log_map;
-};  // class logger
-
-/**
- * @brief An object used for scoped log level setting
- *
- * Instances of `raft::log_level_setter` will set RAFT logging to the level
- * indicated on construction and will revert to the previous set level on
- * destruction.
- */
-struct log_level_setter {
-  explicit log_level_setter(int level)
-  {
-    prev_level_ = logger::get(RAFT_NAME).get_level();
-    logger::get(RAFT_NAME).set_level(level);
-  }
-  ~log_level_setter() { logger::get(RAFT_NAME).set_level(prev_level_); }
-
- private:
-  int prev_level_;
-};  // class log_level_setter
-
-};  // namespace raft
diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp
deleted file mode 100644
index ea5f4ea26e..0000000000
--- a/cpp/include/raft/core/logger-inl.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "logger-macros.hpp"
-
-#include <stdarg.h>
-
-#include <algorithm>
-#include <memory>
-#include <mutex>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-// The logger-ext.hpp file contains the class declaration of the logger class.
-// In this case, it is okay to include the logger-ext.hpp file because it
-// contains no RAFT_EXPLICIT template instantiations.
-#include "logger-ext.hpp"
-
-#define SPDLOG_HEADER_ONLY
-#include <raft/core/detail/callback_sink.hpp>
-#include <raft/core/detail/macros.hpp>  // RAFT_INLINE_CONDITIONAL
-
-#include <spdlog/sinks/stdout_color_sinks.h>  // NOLINT
-#include <spdlog/spdlog.h>                    // NOLINT
-
-namespace raft {
-
-namespace detail {
-
-inline std::string format(const char* fmt, va_list& vl)
-{
-  va_list vl_copy;
-  va_copy(vl_copy, vl);
-  int length = std::vsnprintf(nullptr, 0, fmt, vl_copy);
-  assert(length >= 0);
-  std::vector<char> buf(length + 1);
-  std::vsnprintf(buf.data(), length + 1, fmt, vl);
-  return std::string(buf.data());
-}
-
-RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...)
-{
-  va_list vl;
-  va_start(vl, fmt);
-  std::string str = format(fmt, vl);
-  va_end(vl);
-  return str;
-}
-
-inline int convert_level_to_spdlog(int level)
-{
-  level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level));
-  return RAFT_LEVEL_TRACE - level;
-}
-
-}  // namespace detail
-
-class logger::impl {  // defined privately here
-                      // ... all private data and functions: all of these
-                      //     can now change without recompiling callers ...
- public:
-  std::shared_ptr<spdlog::sinks::callback_sink_mt> sink;
-  std::shared_ptr<spdlog::logger> spdlogger;
-  std::string cur_pattern;
-  int cur_level;
-
-  impl(std::string const& name_ = "")
-    : sink{std::make_shared<spdlog::sinks::callback_sink_mt>()},
-      spdlogger{std::make_shared<spdlog::logger>(name_, sink)},
-      cur_pattern()
-  {
-  }
-};  // class logger::impl
-
-RAFT_INLINE_CONDITIONAL logger::logger(std::string const& name_) : pimpl(new impl(name_))
-{
-  set_pattern(default_log_pattern);
-  set_level(RAFT_ACTIVE_LEVEL);
-}
-
-RAFT_INLINE_CONDITIONAL logger& logger::get(std::string const& name)
-{
-  if (log_map.find(name) == log_map.end()) { log_map[name] = std::make_shared<raft::logger>(name); }
-  return *log_map[name];
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_level(int level)
-{
-  level = raft::detail::convert_level_to_spdlog(level);
-  pimpl->spdlogger->set_level(static_cast<spdlog::level::level_enum>(level));
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_pattern(const std::string& pattern)
-{
-  pimpl->cur_pattern = pattern;
-  pimpl->spdlogger->set_pattern(pattern);
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_callback(void (*callback)(int lvl, const char* msg))
-{
-  pimpl->sink->set_callback(callback);
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
-
-RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const
-{
-  level        = raft::detail::convert_level_to_spdlog(level);
-  auto level_e = static_cast<spdlog::level::level_enum>(level);
-  return pimpl->spdlogger->should_log(level_e);
-}
-
-RAFT_INLINE_CONDITIONAL int logger::get_level() const
-{
-  auto level_e = pimpl->spdlogger->level();
-  return RAFT_LEVEL_TRACE - static_cast<int>(level_e);
-}
-
-RAFT_INLINE_CONDITIONAL std::string logger::get_pattern() const { return pimpl->cur_pattern; }
-
-RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...)
-{
-  level        = raft::detail::convert_level_to_spdlog(level);
-  auto level_e = static_cast<spdlog::level::level_enum>(level);
-  // explicit check to make sure that we only expand messages when required
-  if (pimpl->spdlogger->should_log(level_e)) {
-    va_list vl;
-    va_start(vl, fmt);
-    auto msg = raft::detail::format(fmt, vl);
-    va_end(vl);
-    pimpl->spdlogger->log(level_e, msg);
-  }
-}
-
-RAFT_INLINE_CONDITIONAL void logger::flush() { pimpl->spdlogger->flush(); }
-
-RAFT_INLINE_CONDITIONAL logger::~logger() {}
-
-};  // namespace raft
diff --git a/cpp/include/raft/core/logger-macros.hpp b/cpp/include/raft/core/logger-macros.hpp
index 5ddb072067..e32440dcce 100644
--- a/cpp/include/raft/core/logger-macros.hpp
+++ b/cpp/include/raft/core/logger-macros.hpp
@@ -15,92 +15,17 @@
  */
 #pragma once
 
-/**
- * @defgroup logging levels used in raft
- *
- * @note exactly match the corresponding ones (but reverse in terms of value)
- *       in spdlog for wrapping purposes
- *
- * @{
- */
-#define RAFT_LEVEL_TRACE    6
-#define RAFT_LEVEL_DEBUG    5
-#define RAFT_LEVEL_INFO     4
-#define RAFT_LEVEL_WARN     3
-#define RAFT_LEVEL_ERROR    2
-#define RAFT_LEVEL_CRITICAL 1
-#define RAFT_LEVEL_OFF      0
-/** @} */
-
-#if !defined(RAFT_ACTIVE_LEVEL)
-#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO
-#endif
-
-/**
- * @defgroup loggerMacros Helper macros for dealing with logging
- * @{
- */
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_TRACE(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE_VEC(ptr, len)                                      \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    print_vector(#ptr, ptr, len, ss);                                     \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
+#include <sstream>
+
+#if (RAFT_LOG_ACTIVE_LEVEL <= RAFT_LOG_LEVEL_TRACE)
+#define RAFT_LOG_TRACE_VEC(ptr, len)                                                     \
+  do {                                                                                   \
+    std::stringstream ss;                                                                \
+    ss << __FILE__ << ":" << __LINE__ << " ";                                            \
+    print_vector(#ptr, ptr, len, ss);                                                    \
+    /* Forward the assembled text to the default logger at trace level. */               \
+    RAFT_LOGGER_CALL(raft::default_logger(), raft::level_enum::trace, ss.str().c_str()); \
   } while (0)
 #else
 #define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
 #endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
-#define RAFT_LOG_DEBUG(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_DEBUG(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO)
-#define RAFT_LOG_INFO(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_INFO(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN)
-#define RAFT_LOG_WARN(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_WARN(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR)
-#define RAFT_LOG_ERROR(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_ERROR(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL)
-#define RAFT_LOG_CRITICAL(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_CRITICAL(fmt, ...) void(0)
-#endif
-/** @} */
diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
deleted file mode 100644
index e64a0db257..0000000000
--- a/cpp/include/raft/core/logger.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "logger-ext.hpp"
-#include "logger-macros.hpp"
-
-#if !defined(RAFT_COMPILED)
-#include "logger-inl.hpp"
-#endif
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 55184cc615..0e00ef571f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/nvtx.hpp>
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 388dd60f14..44d55c36de 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -16,7 +16,8 @@
 
 #pragma once
 
-#include <raft/core/logger.hpp>  // RAFT_LOG_TRACE
+#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>                              // raft::resources
 #include <raft/distance/distance_types.hpp>                     // is_min_close, DistanceType
diff --git a/cpp/include/raft/solver/detail/lap_kernels.cuh b/cpp/include/raft/solver/detail/lap_kernels.cuh
index 383c3ab713..3c25852240 100644
--- a/cpp/include/raft/solver/detail/lap_kernels.cuh
+++ b/cpp/include/raft/solver/detail/lap_kernels.cuh
@@ -26,6 +26,7 @@
 
 #include "../linear_assignment_types.hpp"
 
+#include <raft/core/detail/macros.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
@@ -552,4 +553,4 @@ RAFT_KERNEL kernel_calcObjValPrimal(weight_t* d_obj_val_primal,
   }
 }
 
-}  // namespace raft::solver::detail
\ No newline at end of file
+}  // namespace raft::solver::detail
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 02a77a0d99..6f03f77bc0 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -24,7 +24,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
diff --git a/cpp/src/core/logger.cpp b/cpp/src/core/logger.cpp
deleted file mode 100644
index 8f81cf2926..0000000000
--- a/cpp/src/core/logger.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <raft/core/logger-inl.hpp>
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 621ee6c160..4cd0a32f51 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -55,6 +55,7 @@ function(ConfigureTest)
             ${RAFT_CTK_MATH_DEPENDENCIES}
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
+            raft_test_logger
   )
   set_target_properties(
     ${TEST_NAME}
@@ -87,6 +88,10 @@ function(ConfigureTest)
   )
 endfunction()
 
+# Create an object library for the logger so that we don't have to recompile it.
+add_library(raft_test_logger OBJECT)
+target_link_libraries(raft_test_logger PRIVATE raft_logger_impl)
+
 # ##################################################################################################
 # test sources ##################################################################################
 # ##################################################################################################
diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp
index c63d5896e5..007b57378f 100644
--- a/cpp/test/core/device_resources_manager.cpp
+++ b/cpp/test/core/device_resources_manager.cpp
@@ -89,7 +89,7 @@ TEST(DeviceResourcesManager, ObeysSetters)
 
   // Suppress the many warnings from testing use of setters after initial
   // get_device_resources call
-  auto scoped_log_level = log_level_setter{RAFT_LEVEL_ERROR};
+  auto scoped_log_level = log_level_setter{level_enum::error};
 
   omp_set_dynamic(0);
 #pragma omp parallel for num_threads(5)
diff --git a/cpp/test/core/logger.cpp b/cpp/test/core/logger.cpp
index 7f31beed71..10adb71dda 100644
--- a/cpp/test/core/logger.cpp
+++ b/cpp/test/core/logger.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-// We set RAFT_ACTIVE_LEVEL to a value that would enable testing trace and debug logs
+// We set RAFT_LOG_ACTIVE_LEVEL to a value that would enable testing trace and debug logs
 // (otherwise trace and debug logs are desabled by default).
-#undef RAFT_ACTIVE_LEVEL
-#define RAFT_ACTIVE_LEVEL 6
+#undef RAFT_LOG_ACTIVE_LEVEL
+#define RAFT_LOG_ACTIVE_LEVEL RAFT_LOG_LEVEL_TRACE
 
 #include <raft/core/logger.hpp>
 
@@ -34,15 +34,15 @@ TEST(logger, Test)
   RAFT_LOG_WARN("This is a warning message");
   RAFT_LOG_INFO("This is an info message");
 
-  logger::get(RAFT_NAME).set_level(RAFT_LEVEL_WARN);
-  ASSERT_EQ(RAFT_LEVEL_WARN, logger::get(RAFT_NAME).get_level());
-  logger::get(RAFT_NAME).set_level(RAFT_LEVEL_INFO);
-  ASSERT_EQ(RAFT_LEVEL_INFO, logger::get(RAFT_NAME).get_level());
+  default_logger().set_level(raft::level_enum::warn);
+  ASSERT_EQ(raft::level_enum::warn, default_logger().level());
+  default_logger().set_level(raft::level_enum::info);
+  ASSERT_EQ(raft::level_enum::info, default_logger().level());
 
-  ASSERT_FALSE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_TRACE));
-  ASSERT_FALSE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_DEBUG));
-  ASSERT_TRUE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_INFO));
-  ASSERT_TRUE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_WARN));
+  ASSERT_FALSE(default_logger().should_log(raft::level_enum::trace));
+  ASSERT_FALSE(default_logger().should_log(raft::level_enum::debug));
+  ASSERT_TRUE(default_logger().should_log(raft::level_enum::info));
+  ASSERT_TRUE(default_logger().should_log(raft::level_enum::warn));
 }
 
 std::string logged = "";
@@ -57,60 +57,61 @@ class loggerTest : public ::testing::Test {
   {
     flushCount = 0;
     logged     = "";
-    logger::get(RAFT_NAME).set_level(RAFT_LEVEL_TRACE);
+    default_logger().set_level(raft::level_enum::trace);
   }
 
   void TearDown() override
   {
-    logger::get(RAFT_NAME).set_callback(nullptr);
-    logger::get(RAFT_NAME).set_flush(nullptr);
-    logger::get(RAFT_NAME).set_level(RAFT_LEVEL_INFO);
+    default_logger().sinks().pop_back();
+    default_logger().set_level(raft::level_enum::info);
   }
 };
 
-// The logging macros depend on `RAFT_ACTIVE_LEVEL` as well as the logger verbosity;
-// The verbosity is set to `RAFT_LEVEL_TRACE`, but `RAFT_ACTIVE_LEVEL` is set outside of here.
-auto check_if_logged(const std::string& msg, int log_level_def) -> bool
+// The logging macros depend on `RAFT_LOG_ACTIVE_LEVEL` as well as the logger verbosity;
+// The verbosity is set to `RAFT_LOG_LEVEL_TRACE`, but `RAFT_LOG_ACTIVE_LEVEL` is set outside of
+// here.
+auto check_if_logged(const std::string& msg, raft::level_enum log_level_def) -> bool
 {
   bool actually_logged  = logged.find(msg) != std::string::npos;
-  bool should_be_logged = RAFT_ACTIVE_LEVEL >= log_level_def;
+  bool should_be_logged = RAFT_LOG_ACTIVE_LEVEL <= static_cast<int>(log_level_def);
   return actually_logged == should_be_logged;
 }
 
 TEST_F(loggerTest, callback)
 {
   std::string testMsg;
-  logger::get(RAFT_NAME).set_callback(exampleCallback);
+  default_logger().sinks().push_back(std::make_shared<callback_sink_mt>(exampleCallback));
 
   testMsg = "This is a critical message";
   RAFT_LOG_CRITICAL(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_CRITICAL));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::critical));
 
   testMsg = "This is an error message";
   RAFT_LOG_ERROR(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_ERROR));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::error));
 
   testMsg = "This is a warning message";
   RAFT_LOG_WARN(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_WARN));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::warn));
 
   testMsg = "This is an info message";
   RAFT_LOG_INFO(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_INFO));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::info));
 
   testMsg = "This is a debug message";
   RAFT_LOG_DEBUG(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_DEBUG));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::debug));
 
   testMsg = "This is a trace message";
   RAFT_LOG_TRACE(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_TRACE));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::trace));
 }
 
 TEST_F(loggerTest, flush)
 {
-  logger::get(RAFT_NAME).set_flush(exampleFlush);
-  logger::get(RAFT_NAME).flush();
+  default_logger().sinks().push_back(
+    std::make_shared<callback_sink_mt>(exampleCallback, exampleFlush));
+  default_logger().flush();
   ASSERT_EQ(1, flushCount);
 }
 
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 5cc694dc8f..6240b2638b 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -256,14 +256,14 @@ There are 7 logging levels with each successive level becoming quieter:
 7. RAFT_LEVEL_OFF
    Pass one of these as per your needs into the `set_level()` method as follows:
 ```cpp
-raft::logger::get().set_level(RAFT_LEVEL_WARN);
+raft::default_logger().set_level(raft::level_enum::warn);
 // From now onwards, this will print only WARN and above kind of messages
 ```
 
 ### Changing logging pattern
 Pass the [format string](https://github.com/gabime/spdlog/wiki/3.-Custom-formatting) as follows in order use a different logging pattern than the default.
 ```cpp
-raft::logger::get.set_pattern(YourFavoriteFormat);
+raft::default_logger().set_pattern(YourFavoriteFormat);
 ```
 One can also use the corresponding `get_pattern()` method to know the current format as well.
 

From 26f8d06c6699e50fa085763fc8f0d7a6d02c5ceb Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 3 Jan 2025 11:29:49 -0800
Subject: [PATCH 15/37] Use rapids-cmake for the logger (#2534)

This PR switches raft to use rapids-cmake to fetch rapids-logger so that it uses a consistent version with the rest of RAPIDS to avoid any cases where transitive CPM loads result in multiple packages being built from source that require a different version of rapids-logger.

Depends on https://github.com/rapidsai/rapids-cmake/pull/737 and https://github.com/rapidsai/rmm/pull/1776.

Contributes to rapidsai/build-planning#104.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2534
---
 cpp/CMakeLists.txt | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 06531941aa..621f9fcef2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -163,11 +163,8 @@ include(cmake/modules/ConfigureCUDA.cmake)
 # add third party dependencies using CPM
 rapids_cpm_init()
 
-# Not using rapids-cmake since we never want to find, always download.
-CPMAddPackage(
-  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG
-  4df3ee70c6746fd1b6c0dc14209dae2e2d4378c6 VERSION 4df3ee70c6746fd1b6c0dc14209dae2e2d4378c6
-)
+include(${rapids-cmake-dir}/cpm/rapids_logger.cmake)
+rapids_cpm_rapids_logger()
 rapids_make_logger(raft LOGGER_HEADER_DIR include/raft/core EXPORT_SET raft-exports)
 
 # CCCL before rmm/cuco so we get the right version of CCCL

From 8fc988e11d82404ef7b52f4c810d4a4ed07cd2a2 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 7 Jan 2025 15:49:56 -0600
Subject: [PATCH 16/37] remove unused 'joblib' and 'numba' dependencies, other
 packaging cleanup (#2532)

Proposes some cleanup of packaging details, noticed while I was working on #2531

* removes runtime dependencies on `joblib` and `numba` for `raft-dask`
   - *`raft-dask` doesn't directly import from these libraries, and the git blame didn't suggest any other reason that they were being pinned here*
   - *checked with `git grep -E 'joblib|numba'`*
* removes `setup.cfg` files
   - *these are currently being ignored by tools, in favor of identical configuration in `pyproject.toml` and `.flake8` files*
   - e.g. https://github.com/rapidsai/raft/blob/bfd190687ee396374b7106d9ac26add73b57b22a/.pre-commit-config.yaml#L16-L19
* packages license files in conda packages
  - *think these were just missed in the round of PRs like this: https://github.com/rapidsai/cuml/pull/6061*
* removes some outdated / inaccurate comments in packaging configs

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2532
---
 .pre-commit-config.yaml                       |  3 +-
 .../all_cuda-118_arch-aarch64.yaml            |  2 -
 .../all_cuda-118_arch-x86_64.yaml             |  2 -
 .../all_cuda-125_arch-aarch64.yaml            |  2 -
 .../all_cuda-125_arch-x86_64.yaml             |  2 -
 conda/recipes/pylibraft/meta.yaml             |  4 +-
 conda/recipes/raft-dask/meta.yaml             |  5 +-
 dependencies.yaml                             |  4 --
 pyproject.toml                                |  2 +-
 python/pylibraft/setup.cfg                    | 38 -------------
 python/raft-dask/pyproject.toml               |  2 -
 setup.cfg                                     | 55 -------------------
 12 files changed, 4 insertions(+), 117 deletions(-)
 delete mode 100644 python/pylibraft/setup.cfg
 delete mode 100644 setup.cfg

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e3b3c8c440..d5456ba30b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -110,8 +110,7 @@ repos:
                   [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
                   CMakeLists[.]txt$|
                   CMakeLists_standalone[.]txt$|
-                  meta[.]yaml$|
-                  setup[.]cfg$
+                  meta[.]yaml$
             exclude: |
               (?x)
                   cpp/include/raft/neighbors/detail/faiss_select/|
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index e145aeb92e..793ca8dc67 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -26,7 +26,6 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -38,7 +37,6 @@ dependencies:
 - libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-aarch64=11.8
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 75dcffa95d..a9f839bd03 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -26,7 +26,6 @@ dependencies:
 - gcc_linux-64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -38,7 +37,6 @@ dependencies:
 - libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index bfa32c80d1..9d7286bb8e 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -27,7 +27,6 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -35,7 +34,6 @@ dependencies:
 - libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - pre-commit
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 98ec334635..e4ec074ae5 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -27,7 +27,6 @@ dependencies:
 - gcc_linux-64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -35,7 +34,6 @@ dependencies:
 - libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - pre-commit
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 4a8ed29c85..0b57432402 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -1,7 +1,5 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
-# Usage:
-#   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -81,5 +79,5 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: pylibraft library
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index a8be273f82..19155166af 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -1,7 +1,5 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
-# Usage:
-#   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -70,7 +68,6 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - dask-cuda ={{ minor_version }}
     - rapids-dask-dependency ={{ minor_version }}
-    - joblib >=0.11
     - nccl {{ nccl_version }}
     - pylibraft {{ version }}
     - python x.x
@@ -87,5 +84,5 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: raft-dask library
diff --git a/dependencies.yaml b/dependencies.yaml
index dc1807fbf9..689cf8414c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -392,8 +392,6 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - dask-cuda==25.2.*,>=0.0.0a0
-          - joblib>=0.11
-          - numba>=0.57
           - rapids-dask-dependency==25.2.*,>=0.0.0a0
       - output_types: conda
         packages:
@@ -402,7 +400,6 @@ dependencies:
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for cudf and rmm.
           - --extra-index-url=https://pypi.nvidia.com
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
@@ -482,7 +479,6 @@ dependencies:
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
           - --extra-index-url=https://pypi.nvidia.com
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
diff --git a/pyproject.toml b/pyproject.toml
index 5042113388..2f23debfbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,6 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,numer"
+ignore-words-list = "inout,unparseable,numer"
 builtin = "clear"
 quiet-level = 3
diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg
deleted file mode 100644
index 7d1a0c9065..0000000000
--- a/python/pylibraft/setup.cfg
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-[isort]
-line_length=79
-multi_line_output=3
-include_trailing_comma=True
-force_grid_wrap=0
-combine_as_imports=True
-order_by_type=True
-known_dask=
-    dask
-    distributed
-    dask_cuda
-known_rapids=
-    nvtext
-    cudf
-    cuml
-    cugraph
-    dask_cudf
-    rmm
-known_first_party=
-    raft
-    pylibraft
-default_section=THIRDPARTY
-sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER
-skip=
-    thirdparty
-    .eggs
-    .git
-    .hg
-    .mypy_cache
-    .tox
-    .venv
-    _build
-    buck-out
-    build
-    dist
-    __init__.py
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 33643c481e..cabe8e72a6 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -33,8 +33,6 @@ requires-python = ">=3.10"
 dependencies = [
     "dask-cuda==25.2.*,>=0.0.0a0",
     "distributed-ucxx==0.42.*,>=0.0.0a0",
-    "joblib>=0.11",
-    "numba>=0.57",
     "pylibraft==25.2.*,>=0.0.0a0",
     "rapids-dask-dependency==25.2.*,>=0.0.0a0",
     "ucx-py==0.42.*,>=0.0.0a0",
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 94140d4d00..0000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-[flake8]
-filename = *.py, *.pyx, *.pxd, *.pxi
-exclude = __init__.py, *.egg, build, docs, .git
-force-check = True
-ignore =
-    # line break before binary operator
-    W503,
-    # whitespace before :
-    E203
-per-file-ignores =
-    # Rules ignored only in Cython:
-    # E211: whitespace before '(' (used in multi-line imports)
-    # E225: Missing whitespace around operators (breaks cython casting syntax like <int>)
-    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
-    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
-    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
-    # E402: invalid syntax (works for Python, not Cython)
-    # E999: invalid syntax (works for Python, not Cython)
-    # W504: line break after binary operator (breaks lines that end with a pointer)
-    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
-
-[pydocstyle]
-# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
-# than include using match-dir. Note that as discussed in
-# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle,
-# unlike the match option above this match-dir will have no effect when
-# pydocstyle is invoked from pre-commit. Therefore this exclusion list must
-# also be maintained in the pre-commit config file.
-match-dir = ^(?!(ci|cpp|conda|docs)).*$
-# Allow missing docstrings for docutils
-ignore-decorators = .*(docutils|doc_apply|copy_docstring).*
-select =
-    D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418
-    # Would like to enable the following rules in the future:
-    # D200, D202, D205, D400
-
-[mypy]
-ignore_missing_imports = True
-# If we don't specify this, then mypy will check excluded files if
-# they are imported by a checked file.
-follow_imports = skip
-
-[codespell]
-# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
-# this is only to allow you to run codespell interactively
-skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild
-# ignore short words, and typename parameters like OffsetT
-ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b
-ignore-words-list = inout,unparseable,numer
-builtin = clear
-quiet-level = 3

From 1b62c4117a35b11ce3c830daae248e32ebf75e3f Mon Sep 17 00:00:00 2001
From: Victor Lafargue <viclafargue@nvidia.com>
Date: Fri, 10 Jan 2025 22:29:03 +0100
Subject: [PATCH 17/37] Fix lanczos solver integer overflow (#2536)

Partially answers https://github.com/rapidsai/cuml/issues/6204

Authors:
  - Victor Lafargue (https://github.com/viclafargue)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/raft/pull/2536
---
 cpp/include/raft/sparse/detail/coo.cuh        | 14 ++++++++-----
 .../raft/sparse/solver/detail/lanczos.cuh     | 21 +++++++++++--------
 .../raft/spectral/detail/matrix_wrappers.hpp  |  8 +++----
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 91ba363168..9a38c11a07 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -182,7 +182,7 @@ class COO {
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of columns in the dense matrix
    */
-  void setSize(int n_rows, int n_cols)
+  void setSize(Index_Type n_rows, Index_Type n_cols)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
@@ -192,7 +192,7 @@ class COO {
    * @brief Set the number of rows and cols for a square dense matrix
    * @param n: number of rows and cols
    */
-  void setSize(int n)
+  void setSize(Index_Type n)
   {
     this->n_rows = n;
     this->n_cols = n;
@@ -204,7 +204,10 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
+  void allocate(Index_Type nnz, bool init, cudaStream_t stream)
+  {
+    this->allocate(nnz, 0, init, stream);
+  }
 
   /**
    * @brief Allocate the underlying arrays
@@ -213,7 +216,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream)
+  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -226,7 +229,8 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
+  void allocate(
+    Index_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 6f03f77bc0..ddfa01731a 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -624,7 +624,7 @@ static int lanczosRestart(raft::resources const& handle,
   value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + (size_t)n * (size_t)iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -641,7 +641,7 @@ static int lanczosRestart(raft::resources const& handle,
   // std::cout <<std::endl;
 
   // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(value_type_t));
+  memset(V_host, 0, (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t));
   for (i = 0; i < iter; ++i)
     V_host[IDX(i, i, iter)] = 1;
 
@@ -679,8 +679,11 @@ static int lanczosRestart(raft::resources const& handle,
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  RAFT_CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(V_dev,
+                                V_host,
+                                (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
 
   beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
   RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h,
@@ -716,7 +719,7 @@ static int lanczosRestart(raft::resources const& handle,
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
                                 work_dev,
-                                n * iter_new * sizeof(value_type_t),
+                                (size_t)n * (size_t)iter_new * (size_t)sizeof(value_type_t),
                                 cudaMemcpyDeviceToDevice,
                                 stream));
 
@@ -1045,10 +1048,10 @@ int computeSmallestEigenvectors(
   unsigned long long seed = 1234567)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1395,10 +1398,10 @@ int computeLargestEigenvectors(
   unsigned long long seed = 123456)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 1fe078bd32..db8a5dc9ef 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -39,14 +39,14 @@
 // =========================================================
 
 // Get index of matrix entry
-#define IDX(i, j, lda) ((i) + (j) * (lda))
+#define IDX(i, j, lda) ((size_t)(i) + (size_t)(j) * (size_t)(lda))  // widen before multiplying
 
 namespace raft {
 namespace spectral {
 namespace matrix {
 namespace detail {
 
-using size_type = int;  // for now; TODO: move it in appropriate header
+using size_type = size_t;  // for now; TODO: move it in appropriate header
 
 // Apply diagonal matrix to vector:
 //
@@ -326,7 +326,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
         raft_handle, row_offsets, col_indices, values, nrows, nnz),
       diagonal_(raft_handle, nrows)
   {
-    vector_t<value_type> ones{raft_handle, nrows};
+    vector_t<value_type> ones{raft_handle, (size_t)nrows};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
@@ -341,7 +341,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
                                               csr_m.nnz_),
       diagonal_(raft_handle, csr_m.nrows_)
   {
-    vector_t<value_type> ones{raft_handle, csr_m.nrows_};
+    vector_t<value_type> ones{raft_handle, (size_t)csr_m.nrows_};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }

From 5c826d7320486852c30a18f6e039d0cda83c5c62 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Tue, 14 Jan 2025 00:34:20 +0100
Subject: [PATCH 18/37] Add support for different data type of bitset (#2535)

This PR is useful for Milvus.
Previously the `bitset_view` object only supported the data type used to create the bitset. With the proposed modifications, a `bitset_view` object can support any data type used to create the bitset by specifying the `original_nbits` parameter in the class constructor.

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - rhdong (https://github.com/rhdong)

URL: https://github.com/rapidsai/raft/pull/2535
---
 cpp/include/raft/core/bitmap.hpp | 24 ++++++--
 cpp/include/raft/core/bitset.cuh | 53 ++++++++++++++---
 cpp/include/raft/core/bitset.hpp | 34 +++++++++--
 cpp/test/core/bitset.cu          | 98 ++++++++++++++++++++++++++++++--
 4 files changed, 188 insertions(+), 21 deletions(-)

diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp
index 86b2d77478..5a6656f572 100644
--- a/cpp/include/raft/core/bitmap.hpp
+++ b/cpp/include/raft/core/bitmap.hpp
@@ -53,9 +53,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_ptr Device raw pointer
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Original number of bits used when the bitmap was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitmap was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
-  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols), rows_(rows), cols_(cols)
+  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr,
+                                index_t rows,
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
@@ -65,11 +74,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_span Device vector view of the bitmap
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Original number of bits used when the bitmap was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitmap was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
   _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
                                 index_t rows,
-                                index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols), rows_(rows), cols_(cols)
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index d1bffdb81e..feaef1a172 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -32,12 +32,41 @@
 
 namespace raft::core {
 
+// Translate a bit position expressed in `original_nbits`-wide words into an element index
+// (`new_bit_index`) and an in-element bit offset (`new_bit_offset`) for `nbits`-wide elements.
+template <typename index_t>
+_RAFT_HOST_DEVICE void inline compute_original_nbits_position(const index_t original_nbits,
+                                                              const index_t nbits,
+                                                              const index_t sample_index,
+                                                              index_t& new_bit_index,
+                                                              index_t& new_bit_offset)
+{
+  const index_t src_word = sample_index / original_nbits;
+  const index_t src_bit  = sample_index % original_nbits;
+  const index_t base     = src_word * original_nbits / nbits;
+  if (original_nbits > nbits) {
+    new_bit_index  = base + src_bit / nbits;  // step into the wider source word
+    new_bit_offset = src_bit % nbits;
+  } else {
+    new_bit_index  = base;  // several source words share one element
+    new_bit_offset = (src_word % (nbits / original_nbits)) * original_nbits + src_bit % nbits;
+  }
+}
+
 template <typename bitset_t, typename index_t>
 _RAFT_HOST_DEVICE inline bool bitset_view<bitset_t, index_t>::test(const index_t sample_index) const
 {
-  const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
-  const index_t bit_index    = sample_index % bitset_element_size;
-  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bit_element = bitset_ptr_[bit_index];
+  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_offset)) != 0;
   return is_bit_set;
 }
 
@@ -51,14 +80,22 @@ template <typename bitset_t, typename index_t>
 _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
                                                       bool set_value) const
 {
-  const index_t bit_element = sample_index / bitset_element_size;
-  const index_t bit_index   = sample_index % bitset_element_size;
-  const bitset_t bitmask    = bitset_t{1} << bit_index;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bitmask = bitset_t{1} << bit_offset;
   if (set_value) {
-    atomicOr(bitset_ptr_ + bit_element, bitmask);
+    atomicOr(bitset_ptr_ + bit_index, bitmask);
   } else {
     const bitset_t bitmask2 = ~bitmask;
-    atomicAnd(bitset_ptr_ + bit_element, bitmask2);
+    atomicAnd(bitset_ptr_ + bit_index, bitmask2);
   }
 }
 
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index be828def87..e4bea2c0c5 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -42,8 +42,20 @@ template <typename bitset_t = uint32_t, typename index_t = uint32_t>
 struct bitset_view {
   static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
 
-  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
-    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  /**
+   * @brief Create a bitset view from a device pointer to the bitset.
+   *
+   * @param bitset_ptr Device pointer to the bitset
+   * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Original number of bits used when the bitset was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitset was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
+   */
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr,
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}, original_nbits_{original_nbits}
   {
   }
   /**
@@ -51,10 +63,17 @@ struct bitset_view {
    *
    * @param bitset_span Device vector view of the bitset
    * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Original number of bits used when the bitset was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitset was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
   _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
-                                index_t bitset_len)
-    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_span.data_handle()},
+      bitset_len_{bitset_len},
+      original_nbits_{original_nbits}
   {
   }
   /**
@@ -180,9 +199,16 @@ struct bitset_view {
     return (bitset_len + bits_per_element - 1) / bits_per_element;
   }
 
+  /**
+   * @brief Get the original number of bits of the bitset.
+   */
+  auto get_original_nbits() const -> index_t { return original_nbits_; }
+  void set_original_nbits(index_t original_nbits) { original_nbits_ = original_nbits; }
+
  private:
   bitset_t* bitset_ptr_;
   index_t bitset_len_;
+  index_t original_nbits_;
 };
 
 /**
diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu
index ac601274c1..f094f60ded 100644
--- a/cpp/test/core/bitset.cu
+++ b/cpp/test/core/bitset.cu
@@ -24,6 +24,8 @@
 #include <gtest/gtest.h>
 
 #include <algorithm>
+#include <cstdint>
+#include <cstdlib>
 #include <numeric>
 
 namespace raft::core {
@@ -73,6 +75,40 @@ void test_cpu_bitset(const std::vector<bitset_t>& bitset,
   }
 }
 
+// CPU reference for a bitset laid out in `original_nbits_`-bit words but read as `bitset_t`
+// elements: result[i] receives 1 if bit queries[i] is set and 0 otherwise.
+template <typename bitset_t, typename index_t>
+void test_cpu_bitset_nbits(const bitset_t* bitset,
+                           const std::vector<index_t>& queries,
+                           std::vector<uint8_t>& result,
+                           unsigned original_nbits_)
+{
+  constexpr size_t nbits = sizeof(bitset_t) * 8;
+  if (original_nbits_ == nbits) {
+    for (size_t i = 0; i < queries.size(); i++) {
+      result[i] =
+        uint8_t((bitset[queries[i] / nbits] & (bitset_t{1} << (queries[i] % nbits))) != 0);
+    }
+    return;  // same word width: nothing to remap, do not redo the work below
+  }
+  for (size_t i = 0; i < queries.size(); i++) {
+    const index_t sample_index        = queries[i];
+    const index_t original_bit_index  = sample_index / original_nbits_;
+    const index_t original_bit_offset = sample_index % original_nbits_;
+    index_t new_bit_index             = original_bit_index * original_nbits_ / nbits;
+    index_t new_bit_offset            = 0;
+    if (original_nbits_ > nbits) {
+      new_bit_index += original_bit_offset / nbits;
+      new_bit_offset = original_bit_offset % nbits;
+    } else {
+      index_t ratio = nbits / original_nbits_;
+      new_bit_offset += (original_bit_index % ratio) * original_nbits_;
+      new_bit_offset += original_bit_offset % nbits;
+    }
+    result[i] = uint8_t((bitset[new_bit_index] & (bitset_t{1} << new_bit_offset)) != 0);
+  }
+}
+
 template <typename bitset_t>
 void flip_cpu_bitset(std::vector<bitset_t>& bitset)
 {
@@ -168,11 +204,12 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
-    auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
-    auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
-    auto query_cpu     = std::vector<index_t>(spec.query_len);
-    auto result_cpu    = std::vector<uint8_t>(spec.query_len);
-    auto result_ref    = std::vector<uint8_t>(spec.query_len);
+    auto query_device     = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
+    auto result_device    = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
+    auto query_cpu        = std::vector<index_t>(spec.query_len);
+    auto result_cpu       = std::vector<uint8_t>(spec.query_len);
+    auto result_ref_nbits = std::vector<uint8_t>(spec.query_len);
+    auto result_ref       = std::vector<uint8_t>(spec.query_len);
 
     // Create queries and verify the test results
     raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
@@ -194,6 +231,57 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
+    // Reinterpret the bitset as uint8_t, uint32_t, then uint64_t
+    {
+      // Test CPU logic on an owning byte copy (nothing leaks if the ASSERT below returns early)
+      test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+      const auto* bytes = reinterpret_cast<const uint8_t*>(bitset_ref.data());
+      std::vector<uint8_t> bitset_cpu_uint8(bytes, bytes + sizeof(bitset_t) * bitset_ref.size());
+      test_cpu_bitset_nbits(
+        bitset_cpu_uint8.data(), query_cpu, result_ref_nbits, sizeof(bitset_t) * 8);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_ref_nbits, raft::Compare<uint8_t>()));
+
+      // Test GPU uint8_t, uint32_t, uint64_t
+      auto my_bitset_view_uint8_t = raft::core::bitset_view<uint8_t, uint32_t>(
+        reinterpret_cast<uint8_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint8_t] __device__(index_t query) {
+          return my_bitset_view_uint8_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint32_t = raft::core::bitset_view<uint32_t, uint32_t>(
+        reinterpret_cast<uint32_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint32_t] __device__(index_t query) {
+          return my_bitset_view_uint32_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint64_t = raft::core::bitset_view<uint64_t, uint32_t>(
+        reinterpret_cast<uint64_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint64_t] __device__(index_t query) {
+          return my_bitset_view_uint64_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+    }
+
     // test sparsity, repeat and eval_n_elements
     {
       auto my_bitset_view  = my_bitset.view();

From 32e918b37809d132fb71faefa7e06b7eea43565d Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 15 Jan 2025 07:38:33 -0800
Subject: [PATCH 19/37] Add missing `#include <cstdint>` (#2540)

This is needed to define `uint64_t` later on.

Authors:
  - https://github.com/jakirkham

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/raft/pull/2540
---
 cpp/include/raft/util/integer_utils.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp
index 5224d5ac4c..7ea5a3d212 100644
--- a/cpp/include/raft/util/integer_utils.hpp
+++ b/cpp/include/raft/util/integer_utils.hpp
@@ -25,6 +25,7 @@
 
 #include <raft/core/detail/macros.hpp>
 
+#include <cstdint>
 #include <limits>
 #include <stdexcept>
 #include <type_traits>

From a7f191639575f5bfcccf48d670ed33a34860f763 Mon Sep 17 00:00:00 2001
From: rhdong <rhdong2017@gmail.com>
Date: Wed, 15 Jan 2025 08:04:55 -0800
Subject: [PATCH 20/37] [Feat] Support `bitset_to_csr` (#2523)

This API, `bitset_to_csr`, will be utilized to implement the `bitset`-based filter for prefiltered Brute Force.

Authors:
  - rhdong (https://github.com/rhdong)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/raft/pull/2523
---
 cpp/bench/prims/linalg/masked_matmul.cu       | 111 ++++---
 cpp/bench/prims/sparse/bitset_to_csr.cu       | 178 +++++++++++
 cpp/include/raft/core/bitmap.cuh              |   8 +
 cpp/include/raft/core/bitmap.hpp              |  20 ++
 cpp/include/raft/core/bitset.cuh              |   8 +
 cpp/include/raft/core/bitset.hpp              |  63 ++++
 cpp/include/raft/sparse/convert/csr.cuh       |  78 ++++-
 .../sparse/convert/detail/bitmap_to_csr.cuh   |  12 +-
 .../sparse/convert/detail/bitset_to_csr.cuh   | 184 +++++++++++
 .../sparse/linalg/detail/masked_matmul.cuh    |  66 +++-
 .../raft/sparse/linalg/masked_matmul.cuh      | 117 +++++++
 .../raft/sparse/linalg/masked_matmul.hpp      |  61 +---
 cpp/test/sparse/convert_csr.cu                | 289 +++++++++++++++++-
 cpp/test/sparse/masked_matmul.cu              | 130 ++++++--
 14 files changed, 1199 insertions(+), 126 deletions(-)
 create mode 100644 cpp/bench/prims/sparse/bitset_to_csr.cu
 create mode 100644 cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
 create mode 100644 cpp/include/raft/sparse/linalg/masked_matmul.cuh

diff --git a/cpp/bench/prims/linalg/masked_matmul.cu b/cpp/bench/prims/linalg/masked_matmul.cu
index eda9cb1710..b96e14a25d 100644
--- a/cpp/bench/prims/linalg/masked_matmul.cu
+++ b/cpp/bench/prims/linalg/masked_matmul.cu
@@ -22,7 +22,7 @@
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/random/rng.cuh>
-#include <raft/sparse/linalg/masked_matmul.hpp>
+#include <raft/sparse/linalg/masked_matmul.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <cusparse_v2.h>
@@ -49,11 +49,14 @@ inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams<value_t>&
 {
   os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n
      << "\tsparsity=" << params.sparsity;
-  if (params.sparsity == 1.0) { os << "<-inner product for comparison"; }
+  if (params.sparsity == 0.0) { os << "<-inner product for comparison"; }
   return os;
 }
 
-template <typename value_t, typename index_t = int64_t, typename bitmap_t = uint32_t>
+template <typename value_t,
+          bool bitmap_or_bitset = true,
+          typename index_t      = int64_t,
+          typename bits_t       = uint32_t>
 struct MaskedMatmulBench : public fixture {
   MaskedMatmulBench(const MaskedMatmulBenchParams<value_t>& p)
     : fixture(true),
@@ -64,15 +67,15 @@ struct MaskedMatmulBench : public fixture {
       c_indptr_d(0, stream),
       c_indices_d(0, stream),
       c_data_d(0, stream),
-      bitmap_d(0, stream),
+      bits_d(0, stream),
       c_dense_data_d(0, stream)
   {
-    index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8));
-    std::vector<bitmap_t> bitmap_h(element);
+    index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bits_t) * 8));
+    std::vector<bits_t> bits_h(element);
 
     a_data_d.resize(params.m * params.k, stream);
     b_data_d.resize(params.k * params.n, stream);
-    bitmap_d.resize(element, stream);
+    bits_d.resize(element, stream);
 
     raft::random::RngState rng(2024ULL);
     raft::random::uniform(
@@ -82,7 +85,13 @@ struct MaskedMatmulBench : public fixture {
 
     std::vector<bool> c_dense_data_h(params.m * params.n);
 
-    c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h);
+    if constexpr (bitmap_or_bitset) {
+      c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h);
+    } else {
+      c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h);
+      repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1);
+      c_true_nnz *= params.m;
+    }
 
     std::vector<value_t> values(c_true_nnz);
     std::vector<index_t> indices(c_true_nnz);
@@ -93,24 +102,49 @@ struct MaskedMatmulBench : public fixture {
     c_indices_d.resize(c_true_nnz, stream);
     c_dense_data_d.resize(params.m * params.n, stream);
 
-    cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr);
+    cpu_convert_to_csr(bits_h, params.m, params.n, indices, indptr);
     RAFT_EXPECTS(c_true_nnz == c_indices_d.size(),
                  "Something wrong. The c_true_nnz != c_indices_d.size()!");
 
     update_device(c_data_d.data(), values.data(), c_true_nnz, stream);
     update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream);
     update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream);
-    update_device(bitmap_d.data(), bitmap_h.data(), element, stream);
+    update_device(bits_d.data(), bits_h.data(), element, stream);
+  }
+
+  /**
+   * Replicates the first `input_bits` bits of `inout` another `repeat` times, back to back,
+   * starting at bit position `input_bits`. Bits are only OR-ed in, never cleared, so any 1-bits
+   * already present in the destination range are kept.
+   */
+  void repeat_cpu_bitset_inplace(std::vector<bits_t>& inout, size_t input_bits, size_t repeat)
+  {
+    constexpr size_t word_bits = sizeof(bits_t) * 8;
+
+    size_t dst = input_bits;  // next bit position to write
+    for (size_t copy = 0; copy < repeat; ++copy) {
+      for (size_t src = 0; src < input_bits; ++src, ++dst) {
+        const bool is_set = (inout[src / word_bits] >> (src % word_bits)) & 1;
+        // Source bits sit below `input_bits`, writes start at `input_bits`: the source range
+        // is never touched while it is being read.
+        inout[dst / word_bits] |= (static_cast<bits_t>(is_set) << (dst % word_bits));
+      }
+    }
   }
 
-  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bits_t>& bits)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
-    for (auto& item : bitmap) {
-      item = static_cast<bitmap_t>(0);
+    if (sparsity == 0.0f) {  // fully dense mask: set every bit, whatever the width of bits_t
+      std::fill(bits.begin(), bits.end(), static_cast<bits_t>(~bits_t{0}));
+      return num_ones;
+    }
+
+    for (auto& item : bits) {
+      item = static_cast<bits_t>(0);
     }
 
     std::random_device rd;
@@ -120,8 +154,8 @@ struct MaskedMatmulBench : public fixture {
     while (num_ones > 0) {
       index_t index = dis(gen);
 
-      bitmap_t& element    = bitmap[index / (8 * sizeof(bitmap_t))];
-      index_t bit_position = index % (8 * sizeof(bitmap_t));
+      bits_t& element      = bits[index / (8 * sizeof(bits_t))];
+      index_t bit_position = index % (8 * sizeof(bits_t));
 
       if (((element >> bit_position) & 1) == 0) {
         element |= (static_cast<index_t>(1) << bit_position);
@@ -131,7 +165,7 @@ struct MaskedMatmulBench : public fixture {
     return res;
   }
 
-  void cpu_convert_to_csr(std::vector<bitmap_t>& bitmap,
+  void cpu_convert_to_csr(std::vector<bits_t>& bits,
                           index_t rows,
                           index_t cols,
                           std::vector<index_t>& indices,
@@ -142,14 +176,14 @@ struct MaskedMatmulBench : public fixture {
     indptr[offset_indptr++] = 0;
 
     index_t index        = 0;
-    bitmap_t element     = 0;
+    bits_t element       = 0;
     index_t bit_position = 0;
 
     for (index_t i = 0; i < rows; ++i) {
       for (index_t j = 0; j < cols; ++j) {
         index        = i * cols + j;
-        element      = bitmap[index / (8 * sizeof(bitmap_t))];
-        bit_position = index % (8 * sizeof(bitmap_t));
+        element      = bits[index / (8 * sizeof(bits_t))];
+        bit_position = index % (8 * sizeof(bits_t));
 
         if (((element >> bit_position) & 1)) {
           indices[offset_values] = static_cast<index_t>(j);
@@ -181,13 +215,17 @@ struct MaskedMatmulBench : public fixture {
       params.n,
       static_cast<index_t>(c_indices_d.size()));
 
-    auto mask =
-      raft::core::bitmap_view<const bitmap_t, index_t>(bitmap_d.data(), params.m, params.n);
-
     auto c = raft::make_device_csr_matrix_view<value_t>(c_data_d.data(), c_structure);
 
-    if (params.sparsity < 1.0) {
-      raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+    if (params.sparsity > 0.0) {
+      if constexpr (bitmap_or_bitset) {
+        auto mask =
+          raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+      } else {
+        auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+      }
     } else {
       raft::distance::pairwise_distance(handle,
                                         a_data_d.data(),
@@ -201,12 +239,16 @@ struct MaskedMatmulBench : public fixture {
     }
     resource::sync_stream(handle);
 
-    raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
-    resource::sync_stream(handle);
-
-    loop_on_state(state, [this, &a, &b, &mask, &c]() {
-      if (params.sparsity < 1.0) {
-        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+    loop_on_state(state, [this, &a, &b, &c]() {
+      if (params.sparsity > 0.0) {
+        if constexpr (bitmap_or_bitset) {
+          auto mask =
+            raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+          raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+        } else {
+          auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+          raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+        }
       } else {
         raft::distance::pairwise_distance(handle,
                                           a_data_d.data(),
@@ -228,7 +270,7 @@ struct MaskedMatmulBench : public fixture {
 
   rmm::device_uvector<value_t> a_data_d;
   rmm::device_uvector<value_t> b_data_d;
-  rmm::device_uvector<bitmap_t> bitmap_d;
+  rmm::device_uvector<bits_t> bits_d;
 
   rmm::device_uvector<value_t> c_dense_data_d;
 
@@ -253,7 +295,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
     raft::util::itertools::product<TestParams>({size_t(10), size_t(1024)},
                                                {size_t(128), size_t(1024)},
                                                {size_t(1024 * 1024)},
-                                               {0.01f, 0.1f, 0.2f, 0.5f, 1.0f});
+                                               {0.99f, 0.9f, 0.8f, 0.5f, 0.0f});
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
@@ -263,6 +305,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
   return param_vec;
 }
 
-RAFT_BENCH_REGISTER((MaskedMatmulBench<float>), "", getInputs<float>());
+RAFT_BENCH_REGISTER((MaskedMatmulBench<float, true>), "", getInputs<float>());
+RAFT_BENCH_REGISTER((MaskedMatmulBench<float, false>), "", getInputs<float>());
 
 }  // namespace raft::bench::linalg
diff --git a/cpp/bench/prims/sparse/bitset_to_csr.cu b/cpp/bench/prims/sparse/bitset_to_csr.cu
new file mode 100644
index 0000000000..fef2d44d3e
--- /dev/null
+++ b/cpp/bench/prims/sparse/bitset_to_csr.cu
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <sstream>
+#include <vector>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {
+  index_t n_repeat;  // number of rows of the CSR output (the bitset is repeated per row)
+  index_t n_cols;    // length of the bitset in bits, also the number of CSR columns
+  float sparsity;    // fraction of bits left unset: ones = n_cols * (1 - sparsity)
+};
+
+// Streams the " rows*cols=...\tsparsity=..." description that run_benchmark uses as its label.
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{
+  return os << " rows*cols=" << params.n_repeat << "*" << params.n_cols
+            << "\tsparsity=" << params.sparsity;
+}
+
+// Benchmarks bitset_to_csr on one random `n_cols`-bit bitset and an `n_repeat`-row CSR output.
+template <typename bitset_t, typename index_t, typename value_t = float>
+struct BitsetToCsrBench : public fixture {
+  BitsetToCsrBench(const bench_param<index_t>& p)
+    : fixture(true),
+      handle(stream),  // initializers follow the member declaration order
+      params(p),
+      bitset_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      values_d(0, stream),
+      nnz(0)
+  {
+    index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8));
+    std::vector<bitset_t> bitset_h(element);
+    nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h);
+
+    // NOTE(review): `nnz` counts the set bits of the single-row bitset, whereas the to_csr docs
+    // ask for nnz_for_csr = nnz_for_bitset * n_rows; confirm the sizes used for n_repeat rows.
+    bitset_d.resize(bitset_h.size(), stream);
+    indptr_d.resize(params.n_repeat + 1, stream);
+    indices_d.resize(nnz, stream);
+    values_d.resize(nnz, stream);
+
+    update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+    resource::sync_stream(handle);
+  }
+
+  // Clears `bitset`, then sets (m * n) * (1 - sparsity) distinct random bits; returns that count.
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitset_t>& bitset)
+  {
+    index_t total    = static_cast<index_t>(m * n);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
+    index_t res      = num_ones;
+
+    for (auto& item : bitset) {
+      item = static_cast<bitset_t>(0);
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis(0, total - 1);
+
+    while (num_ones > 0) {
+      index_t index        = dis(gen);
+      bitset_t& element    = bitset[index / (8 * sizeof(bitset_t))];
+      index_t bit_position = index % (8 * sizeof(bitset_t));
+      if (((element >> bit_position) & 1) == 0) {
+        // Build the mask in the word type: shifting index_t(1) overflows when bitset_t is wider.
+        element |= (static_cast<bitset_t>(1) << bit_position);
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), 1 * params.n_cols);
+    auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
+    auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
+
+    // One untimed call before the measured loop
+    raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);
+    resource::sync_stream(handle);
+    loop_on_state(state, [this, &bitset, &csr]() {
+      raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+  bench_param<index_t> params;
+
+  rmm::device_uvector<bitset_t> bitset_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<value_t> values_d;
+
+  index_t nnz;
+};  // struct BitsetToCsrBench
+
+// Benchmark grid: {10, 1024} rows x 1024 * 1024 columns x sparsity {0.99, 0.9, 0.8, 0.5}.
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{
+  struct TestParams {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+  const std::vector<TestParams> grid = raft::util::itertools::product<TestParams>(
+    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});
+
+  std::vector<bench_param<index_t>> inputs;
+  inputs.reserve(grid.size());
+  for (const TestParams& point : grid) {
+    inputs.push_back(bench_param<index_t>{point.m, point.n, point.sparsity});
+  }
+  return inputs;
+}
+
+// Large benchmark grid: {1, 100} rows x 100 * 1000000 columns x sparsity {0.95, 0.99}.
+template <typename index_t = int64_t>
+const std::vector<bench_param<index_t>> getLargeInputs()
+{
+  struct TestParams {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+  const std::vector<TestParams> grid = raft::util::itertools::product<TestParams>(
+    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});
+
+  std::vector<bench_param<index_t>> inputs;
+  inputs.reserve(grid.size());
+  for (const TestParams& point : grid) {
+    inputs.push_back(bench_param<index_t>{point.m, point.n, point.sparsity});
+  }
+  return inputs;
+}
+
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int, float>), "", getInputs<int>());
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint64_t, int, double>), "", getInputs<int>());
+// Large-input variant: 64-bit indices over the getLargeInputs() grid
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh
index 024b1244a6..b2c9df436f 100644
--- a/cpp/include/raft/core/bitmap.cuh
+++ b/cpp/include/raft/core/bitmap.cuh
@@ -22,6 +22,7 @@
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/sparse/convert/csr.cuh>
 
 #include <type_traits>
 
@@ -42,4 +43,11 @@ _RAFT_DEVICE void bitmap_view<bitmap_t, index_t>::set(const index_t row,
   set(row * cols_ + col, new_value);
 }
 
+template <typename bitmap_t, typename index_t>
+template <typename csr_matrix_t>
+void bitmap_view<bitmap_t, index_t>::to_csr(const raft::resources& res, csr_matrix_t& csr) const
+{
+  raft::sparse::convert::bitmap_to_csr(res, *this, csr);  // forwards to the free-function API
+}
+
 }  // end namespace raft::core
diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp
index 5a6656f572..be305152e8 100644
--- a/cpp/include/raft/core/bitmap.hpp
+++ b/cpp/include/raft/core/bitmap.hpp
@@ -133,6 +133,26 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    */
   inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; }
 
+  /**
+   * @brief Converts to a Compressed Sparse Row (CSR) format matrix.
+   *
+   * This method transforms a two-dimensional bitmap matrix into a CSR representation,
+   * where each '1' bit in the bitmap corresponds to a non-zero entry in the CSR matrix.
+   * The bitmap is interpreted as a row-major matrix, with rows and columns defined by
+   * the dimensions of the bitmap.
+   *
+   * @tparam csr_matrix_t Specifies the CSR matrix type, constrained to raft::device_csr_matrix.
+   *
+   * @param[in] res RAFT resources for managing CUDA streams and execution policies.
+   * @param[out] csr Output parameter where the resulting CSR matrix is stored. Each '1' bit in
+   * the bitmap corresponds to a non-zero element in the CSR matrix.
+   *
+   * The caller must ensure that the `csr` matrix is pre-allocated with dimensions and a
+   * non-zero count matching the expected output.
+   */
+  template <typename csr_matrix_t>
+  void to_csr(const raft::resources& res, csr_matrix_t& csr) const;
+
  private:
   index_t rows_;
   index_t cols_;
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index feaef1a172..24ef3148b8 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -23,6 +23,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/reduce.cuh>
+#include <raft/sparse/convert/csr.cuh>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/popc.cuh>
 
@@ -202,6 +203,13 @@ double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) cons
   return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
 }
 
+template <typename bitset_t, typename index_t>
+template <typename csr_matrix_t>
+void bitset_view<bitset_t, index_t>::to_csr(const raft::resources& res, csr_matrix_t& csr) const
+{
+  raft::sparse::convert::bitset_to_csr(res, *this, csr);  // forwards to the free-function API
+}
+
 template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index e4bea2c0c5..94113822fb 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -205,6 +205,69 @@ struct bitset_view {
   auto get_original_nbits() const -> index_t { return original_nbits_; }
   void set_original_nbits(index_t original_nbits) { original_nbits_ = original_nbits; }
 
+  /**
+   * @brief Converts to a Compressed Sparse Row (CSR) format matrix.
+   *
+   * This method transforms the bitset view into a CSR matrix representation, where each '1' bit in
+   * the bitset corresponds to a non-zero entry in the CSR matrix. The bitset format supports
+   * only a single-row matrix, so if the CSR matrix requires multiple rows, the bitset data is
+   * repeated for each row in the output.
+   *
+   * Example usage:
+   *
+   * @code{.cpp}
+   * #include <raft/core/resource/cuda_stream.hpp>
+   * #include <raft/sparse/convert/csr.cuh>
+   * #include <rmm/device_uvector.hpp>
+   *
+   * using bitset_t = uint32_t;
+   * using index_t  = int;
+   * using value_t  = float;
+   *
+   * raft::resources handle;
+   * auto stream    = resource::get_cuda_stream(handle);
+   * index_t n_rows = 3;
+   * index_t n_cols = 30;
+   *
+   * // Compute bitset size and initialize device memory
+   * index_t bitset_size = (n_cols + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);
+   * rmm::device_uvector<bitset_t> bitset_d(bitset_size, stream);
+   * std::vector<bitset_t> bitset_h = {
+   *   bitset_t(0b11001010),
+   * };  // Example bitset, with 4 non-zero entries.
+   *
+   * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+   *
+   * // Create bitset view and CSR matrix
+   * auto bitset_view = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n_cols);
+   * auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, n_rows, n_cols, 4 * n_rows);
+   *
+   * // Convert bitset to CSR
+   * bitset_view.to_csr(handle, csr);
+   * resource::sync_stream(handle);
+   *
+   * // Results:
+   * // csr.indptr  = [0, 4, 8, 12];
+   * // csr.indices = [1, 3, 6, 7,
+   * //                1, 3, 6, 7,
+   * //                1, 3, 6, 7];
+   * // csr.values  = [1, 1, 1, 1,
+   * //                1, 1, 1, 1,
+   * //                1, 1, 1, 1];
+   * @endcode
+   *
+   * @tparam csr_matrix_t Specifies the CSR matrix type, constrained to raft::device_csr_matrix.
+   *
+   * @param[in] res RAFT resources for managing CUDA streams and execution policies.
+   * @param[out] csr Output parameter where the resulting CSR matrix is stored. Each '1' bit in
+   * the bitset corresponds to a non-zero element in the CSR matrix.
+   *
+   * The caller must ensure that the `csr` matrix is pre-allocated with dimensions and a non-zero
+   * count matching the expected output, i.e., `nnz_for_csr = nnz_for_bitset * n_rows`.
+   */
+  template <typename csr_matrix_t>
+  void to_csr(const raft::resources& res, csr_matrix_t& csr) const;
+
  private:
   bitset_t* bitset_ptr_;
   index_t bitset_len_;
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 081192ed44..73d099a719 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -18,10 +18,12 @@
 
 #pragma once
 
-#include <raft/core/bitmap.cuh>
+#include <raft/core/bitmap.hpp>
+#include <raft/core/bitset.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
 #include <raft/sparse/convert/detail/bitmap_to_csr.cuh>
+#include <raft/sparse/convert/detail/bitset_to_csr.cuh>
 #include <raft/sparse/convert/detail/csr.cuh>
 #include <raft/sparse/csr.hpp>
 
@@ -129,6 +131,80 @@ void bitmap_to_csr(raft::resources const& handle,
   detail::bitmap_to_csr(handle, bitmap, csr);
 }
 
+/**
+ * @brief Converts a bitset to a Compressed Sparse Row (CSR) format matrix.
+ *
+ * The bitset format inherently supports only a single-row matrix (rows=1). If the CSR matrix
+ * requires multiple rows, the data from the bitset will be repeated for each row in the output.
+ *
+ * Example usage:
+ *
+ * @code{.cpp}
+ * #include <raft/core/resource/cuda_stream.hpp>
+ * #include <raft/sparse/convert/csr.cuh>
+ * #include <rmm/device_uvector.hpp>
+ *
+ * #include <vector>
+ *
+ * using bitset_t = uint32_t;
+ * using index_t  = int;
+ * using value_t  = float;
+ * using nnz_t    = index_t;
+ *
+ * raft::resources handle;
+ * auto stream    = resource::get_cuda_stream(handle);
+ * index_t n_rows = 3;
+ * index_t n_cols = 30;
+ *
+ * nnz_t nnz_for_bitset = 4;
+ * nnz_t nnz_for_csr    = nnz_for_bitset * n_rows;
+ *
+ * index_t bitset_size = (n_cols + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);  //  = 1
+ *
+ * rmm::device_uvector<bitset_t> bitset_d(bitset_size, stream);
+ * std::vector<bitset_t> bitset_h = {
+ *   bitset_t(0b11001010),
+ * };  // nnz_for_bitset = 4;
+ *
+ * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+ *
+ * auto bitset_view = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n_cols);
+ * auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, n_rows, n_cols, nnz_for_csr);
+ *
+ * raft::sparse::convert::bitset_to_csr(handle, bitset_view, csr);
+ * resource::sync_stream(handle);
+ *
+ * // Results:
+ * // csr.indptr  = [0, 4, 8, 12];
+ * // csr.indices = [1, 3, 6, 7,
+ * //                1, 3, 6, 7,
+ * //                1, 3, 6, 7];
+ * // csr.values  = [1, 1, 1, 1,
+ * //                1, 1, 1, 1,
+ * //                1, 1, 1, 1];
+ * @endcode
+ *
+ * @tparam       bitset_t       The data type of the elements in the bitset matrix.
+ * @tparam       index_t        The data type used for indexing the elements in the matrices.
+ * @tparam       csr_matrix_t   Specifies the CSR matrix type, constrained to
+ * raft::device_csr_matrix.
+ *
+ * @param[in]    handle         The RAFT handle containing the CUDA stream for operations.
+ * @param[in]    bitset         The bitset matrix view, to be converted to CSR format.
+ * @param[out]   csr            Output parameter where the resulting CSR matrix is stored. Each
+ * '1' bit in the bitset corresponds to a non-zero element in every row of the CSR matrix.
+ */
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>  // SFINAE gate
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  detail::bitset_to_csr(handle, bitset, csr);  // public entry point; forwards to detail::
+}
+
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
index 866923d647..be62f76502 100644
--- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
@@ -283,10 +283,6 @@ void bitmap_to_csr(raft::resources const& handle,
   using nnz_t   = typename csr_matrix_t::nnz_type;
   auto csr_view = csr.structure_view();
 
-  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0 || csr_view.get_nnz() == 0) {
-    return;
-  }
-
   RAFT_EXPECTS(bitmap.get_n_rows() == csr_view.get_n_rows(),
                "Number of rows in bitmap must be equal to "
                "number of rows in csr");
@@ -295,6 +291,8 @@ void bitmap_to_csr(raft::resources const& handle,
                "Number of columns in bitmap must be equal to "
                "number of columns in csr");
 
+  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0) { return; }
+
   auto thrust_policy = resource::get_thrust_policy(handle);
   auto stream        = resource::get_cuda_stream(handle);
 
@@ -330,12 +328,14 @@ void bitmap_to_csr(raft::resources const& handle,
     thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
 
   if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
-    index_t nnz = 0;
+    nnz_t nnz = 0;
     RAFT_CUDA_TRY(cudaMemcpyAsync(
-      &nnz, sub_nnz.data() + sub_nnz_size, sizeof(index_t), cudaMemcpyDeviceToHost, stream));
+      &nnz, sub_nnz.data() + sub_nnz_size, sizeof(nnz_t), cudaMemcpyDeviceToHost, stream));
     resource::sync_stream(handle);
     csr.initialize_sparsity(nnz);
+    if (nnz == 0) return;
   }
+
   constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
   fill_indices_by_rows<bitmap_t, index_t, nnz_t, check_nnz>(handle,
                                                             bitmap.data(),
diff --git a/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
new file mode 100644
index 0000000000..b3b341d793
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/detail/mdspan_util.cuh>  // detail::popc
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/convert/detail/adj_to_csr.cuh>
+#include <raft/sparse/convert/detail/bitmap_to_csr.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <thrust/copy.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <assert.h>
+
+namespace raft {
+namespace sparse {
+namespace convert {
+namespace detail {
+// Copies the nnz column indices of one CSR row into repeat_count further rows and sets their offsets.
+template <typename index_t, typename nnz_t>
+RAFT_KERNEL repeat_csr_kernel(const index_t* indptr,  // not read by this kernel
+                              const index_t* indices,
+                              index_t* repeated_indptr,
+                              index_t* repeated_indices,
+                              nnz_t nnz,
+                              index_t repeat_count)
+{
+  // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit and wrapped once stored in an int.
+  const long long global_id      = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const long long stride         = static_cast<long long>(gridDim.x) * blockDim.x;
+  const bool guard               = global_id < static_cast<long long>(nnz);
+  index_t* repeated_indices_addr = repeated_indices + global_id;
+  // Strided fill of the row offsets; the caller passes indptr + 2, hence slot i holds (i + 2) * nnz.
+  for (long long i = global_id; i < static_cast<long long>(repeat_count); i += stride) {
+    repeated_indptr[i] = static_cast<index_t>((i + 2) * nnz);
+  }
+
+  __syncthreads();
+  // Threads past the end of the row carry a -1 placeholder that is never stored (see guard).
+  const index_t item = guard ? indices[global_id] : static_cast<index_t>(-1);
+  __syncthreads();
+
+  for (index_t row = 0; row < repeat_count; ++row) {
+    index_t start_offset = row * nnz;
+    if (guard) { repeated_indices_addr[start_offset] = item; }
+  }
+}
+// Host launcher for repeat_csr_kernel: one thread per non-zero of the source row, 256 per block.
+template <typename index_t, typename nnz_t>
+void gpu_repeat_csr(raft::resources const& handle,
+                    const index_t* d_indptr,
+                    const index_t* d_indices,
+                    nnz_t nnz,
+                    index_t repeat_count,
+                    index_t* d_repeated_indptr,
+                    index_t* d_repeated_indices)
+{
+  if (nnz == 0) return;  // empty source row: nothing to replicate, and no kernel is launched
+  auto stream            = resource::get_cuda_stream(handle);
+  index_t repeat_csr_tpb = 256;
+  index_t grid           = (nnz + repeat_csr_tpb - 1) / (repeat_csr_tpb);
+
+  repeat_csr_kernel<<<grid, repeat_csr_tpb, 0, stream>>>(
+    d_indptr, d_indices, d_repeated_indptr, d_repeated_indices, nnz, repeat_count);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // report launch failures instead of silently dropping them
+}
+// Builds a CSR matrix in which every row has the sparsity pattern of the bitset; all values are 1.
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  using row_t = typename csr_matrix_t::row_type;
+  using nnz_t = typename csr_matrix_t::nnz_type;
+  // Structure view is captured once here, before any initialize_sparsity() call further down.
+  auto csr_view = csr.structure_view();
+  // The bitset describes a single row, so its length has to match the CSR column count.
+  RAFT_EXPECTS(bitset.size() == csr_view.get_n_cols(),
+               "Number of size in bitset must be equal to "
+               "number of columns in csr");
+  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0) { return; }  // empty shape
+
+  auto thrust_policy = resource::get_thrust_policy(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+
+  index_t* indptr  = csr_view.get_indptr().data();
+  index_t* indices = csr_view.get_indices().data();
+  // Zero all n_rows + 1 row offsets before anything is written into them.
+  RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+
+  size_t sub_nnz_size      = 0;
+  index_t bits_per_sub_col = 0;
+
+  // Sizing call (null output buffer): get buffer size and number of bits per sub-column.
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   static_cast<nnz_t*>(nullptr),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
+  rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
+  // Same call with a real buffer; the bitset is passed as a matrix with exactly one row.
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   sub_nnz.data(),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+  // In-place exclusive scan over all sub_nnz_size + 1 entries; the last entry is read back below.
+  thrust::exclusive_scan(
+    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
+  // bitset_nnz is the non-zero count of ONE row; the whole matrix holds bitset_nnz * n_rows.
+  nnz_t bitset_nnz = 0;
+  if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(
+      &bitset_nnz, sub_nnz.data() + sub_nnz_size, sizeof(nnz_t), cudaMemcpyDeviceToHost, stream));
+    resource::sync_stream(handle);
+    csr.initialize_sparsity(bitset_nnz * csr_view.get_n_rows());
+    if (bitset_nnz == 0) return;  // no bits set: indptr stays all-zero from the memset above
+  } else {
+    bitset_nnz = csr_view.get_nnz() / csr_view.get_n_rows();  // non-owning: per-row share of preset nnz
+  }
+  // NOTE(review): csr_view/indices predate initialize_sparsity(); confirm they stay valid after it.
+  constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
+  fill_indices_by_rows<bitset_t, index_t, nnz_t, check_nnz>(handle,
+                                                            bitset.data(),
+                                                            indptr,
+                                                            1,  // presumably the row count: one row
+                                                            csr_view.get_n_cols(),
+                                                            csr_view.get_nnz(),
+                                                            indices,
+                                                            sub_nnz.data(),
+                                                            bits_per_sub_col,
+                                                            sub_nnz_size);
+  if (csr_view.get_n_rows() > 1) {  // replicate the first row into the remaining n_rows - 1 rows
+    gpu_repeat_csr<index_t, nnz_t>(handle,
+                                   indptr,
+                                   indices,
+                                   bitset_nnz,
+                                   csr_view.get_n_rows() - 1,
+                                   indptr + 2,
+                                   indices + bitset_nnz);
+  }
+  // Every stored element is set to 1.
+  thrust::fill_n(thrust_policy,
+                 csr.get_elements().data(),
+                 csr_view.get_nnz(),
+                 typename csr_matrix_t::element_type(1));
+}
+
+};  // end NAMESPACE detail
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
index 276960628d..bfffa413b2 100644
--- a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/bitset.cuh>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -41,7 +42,7 @@ template <typename value_t, typename output_t, typename index_t, typename nnz_t,
 void masked_matmul(raft::resources const& handle,
                    raft::device_matrix_view<const value_t, index_t, raft::row_major>& A,
                    raft::device_matrix_view<const value_t, index_t, raft::row_major>& B,
-                   raft::core::bitmap_view<const bitmap_t, index_t>& mask,
+                   raft::core::bitmap_view<bitmap_t, index_t>& mask,
                    raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t>& C,
                    std::optional<raft::host_scalar_view<output_t>> alpha,
                    std::optional<raft::host_scalar_view<output_t>> beta)
@@ -100,6 +101,69 @@ void masked_matmul(raft::resources const& handle,
   }
 }
 
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& B,
+                   raft::core::bitset_view<bitset_t, index_t>& mask,  // one row, reused for all rows
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t>& C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha,
+                   std::optional<raft::host_scalar_view<output_t>> beta)
+{
+  index_t m   = A.extent(0);
+  index_t n   = B.extent(0);
+  index_t dim = A.extent(1);
+  // Bitset variant: the mask pattern is written into C's structure, then the products are computed.
+  auto compressed_C_view = C.structure_view();
+
+  RAFT_EXPECTS(A.extent(1) == B.extent(1), "The dim of A must be equal to the dim of B.");
+  RAFT_EXPECTS(A.extent(0) == compressed_C_view.get_n_rows(),
+               "Number of rows in C must match the number of rows in A.");
+  RAFT_EXPECTS(B.extent(0) == compressed_C_view.get_n_cols(),
+               "Number of columns in C must match the number of columns in B.");
+
+  // Temporary CSR matrix built on C's structure view; bitset_to_csr() writes indptr/indices
+  // through that view (the previously declared `stream` local was unused and has been dropped).
+  auto C_matrix = raft::make_device_csr_matrix<output_t, index_t>(handle, compressed_C_view);
+
+  // fill C
+  raft::sparse::convert::bitset_to_csr(handle, mask, C_matrix);
+  // sddmm when A has more than 10 rows or scaling was requested; otherwise the direct CSR dot path.
+  if (m > 10 || alpha.has_value() || beta.has_value()) {
+    auto C_view = raft::make_device_csr_matrix_view<output_t, index_t, index_t, nnz_t>(
+      C.get_elements().data(), compressed_C_view);
+
+    // create B col_major view: the [n, dim] row-major buffer reinterpreted as [dim, n] col-major
+    auto B_col_major = raft::make_device_matrix_view<const value_t, index_t, raft::col_major>(
+      B.data_handle(), dim, n);
+
+    output_t default_alpha = static_cast<output_t>(1.0f);
+    output_t default_beta  = static_cast<output_t>(0.0f);
+    // Missing scalars fall back to alpha = 1, beta = 0 (locals outlive the sddmm call below).
+    if (!alpha.has_value()) { alpha = raft::make_host_scalar_view<output_t>(&default_alpha); }
+    if (!beta.has_value()) { beta = raft::make_host_scalar_view<output_t>(&default_beta); }
+
+    raft::sparse::linalg::sddmm(handle,
+                                A,
+                                B_col_major,
+                                C_view,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                *alpha,
+                                *beta);
+  } else {
+    raft::sparse::distance::detail::faster_dot_on_csr(handle,
+                                                      C.get_elements().data(),
+                                                      compressed_C_view.get_nnz(),
+                                                      compressed_C_view.get_indptr().data(),
+                                                      compressed_C_view.get_indices().data(),
+                                                      A.data_handle(),
+                                                      B.data_handle(),
+                                                      compressed_C_view.get_n_rows(),
+                                                      dim);
+  }
+}
+
 }  // namespace detail
 }  // namespace linalg
 }  // namespace sparse
diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
new file mode 100644
index 0000000000..c33a1afd43
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/sparse/linalg/detail/masked_matmul.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @defgroup masked_matmul Masked Matrix Multiplication
+ * @{
+ */
+
+/**
+ * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
+ * multiplication with the sparsity pattern defined by the mask, resulting in the computation
+ * C = alpha * ((A * B) ∘ spy(mask)) + beta * C.
+ *
+ * This function multiplies two dense matrices A and B, and then applies an element-wise
+ * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha
+ * and added to beta times the original matrix C.
+ *
+ * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double)
+ * @tparam output_t Data type of elements in the output matrices (e.g., float, double)
+ * @tparam index_t Type used for matrix indices
+ * @tparam nnz_t Type used for the number of non-zero entries in CSR format
+ * @tparam bitmap_t Type of the bitmap used for the mask
+ *
+ * @param[in] handle RAFT handle for resource management
+ * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k]
+ * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k]
+ * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape
+ * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is
+ * included (1) or masked out (0).
+ * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m,
+ * n]
+ * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if
+ * std::nullopt)
+ * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt)
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitmap_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitmap_view<bitmap_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  // Views are taken by value so they can bind, as lvalues, to the detail overload's reference params.
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/**
+ * @brief Computes a sparse matrix product with a masked sparsity pattern and scaling.
+ *
+ * This function computes the result of:
+ * C = alpha * ((A * B) ∘ spy(mask)) + beta * C
+ * where:
+ * - A and B are dense input matrices.
+ * - "mask" defines the sparsity pattern for element-wise multiplication.
+ * - The result is scaled by alpha and added to beta times the original C.
+ *
+ * **Special behavior of the mask**:
+ * - The `bitset` mask represents a single row of data, with its bits indicating whether
+ *   each corresponding element in (A * B) is included (1) or masked out (0).
+ * - If the output CSR matrix `C` has multiple rows, the `bitset` is logically repeated
+ *   across all rows of `C`. For example, if `C` has `n_rows` rows, the same `bitset`
+ *   pattern is applied to all rows.
+ *
+ * @tparam value_t    Data type of input matrix elements (e.g., half, float, double).
+ * @tparam output_t   Data type of output matrix elements (e.g., float, double).
+ * @tparam index_t    Type for matrix indices.
+ * @tparam nnz_t      Type for non-zero entries in CSR format.
+ * @tparam bitset_t   Type for the bitset mask.
+ *
+ * @param[in] handle  RAFT handle for managing resources.
+ * @param[in] A       Dense input matrix [m, k] (row-major).
+ * @param[in] B       Dense input matrix [n, k] (row-major).
+ * @param[in] mask    Bitset view representing a single row [1, n], where each bit
+ *                    indicates if the corresponding element in (A * B) is included (1)
+ *                    or masked out (0). The pattern is repeated for all rows of `C`.
+ * @param[inout] C    Output sparse matrix in CSR format [m, n].
+ * @param[in] alpha   Scalar multiplier for (A * B) (default: 1.0 if std::nullopt).
+ * @param[in] beta    Scalar multiplier for the initial C (default: 0 if std::nullopt).
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitset_view<bitset_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  // Views are taken by value so they can bind, as lvalues, to the detail overload's reference params.
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/** @} */  // end of masked_matmul
+
+}  // end namespace linalg
+}  // end namespace sparse
+}  // end namespace raft
diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
index 6cf6e834b9..32322b90f6 100644
--- a/cpp/include/raft/sparse/linalg/masked_matmul.hpp
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
@@ -13,60 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
-
-#include <raft/sparse/linalg/detail/masked_matmul.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * @defgroup masked_matmul Masked Matrix Multiplication
- * @{
+ * This file is deprecated and will be removed in a future release.
+ * Please use the cuh version instead.
  */
 
 /**
- * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
- * multiplication with the sparsity pattern defined by the mask, resulting in the computation
- * C = alpha * ((A * B) ∘ spy(mask)) + beta * C.
- *
- * This function multiplies two dense matrices A and B, and then applies an element-wise
- * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha
- * and added to beta times the original matrix C.
- *
- * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double)
- * @tparam output_t Data type of elements in the output matrices (e.g., float, double)
- * @tparam index_t Type used for matrix indices
- * @tparam nnz_t Type used for the number of non-zero entries in CSR format
- * @tparam bitmap_t Type of the bitmap used for the mask
- *
- * @param[in] handle RAFT handle for resource management
- * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k]
- * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k]
- * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape
- * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is
- * included (1) or masked out (0).
- * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m,
- * n]
- * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if
- * std::nullopt)
- * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt)
+ * DISCLAIMER: this file is deprecated: use masked_matmul.cuh instead
  */
-template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitmap_t>
-void masked_matmul(raft::resources const& handle,
-                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
-                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
-                   raft::core::bitmap_view<const bitmap_t, index_t> mask,
-                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
-                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
-                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
-{
-  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
-}
 
-/** @} */  // end of masked_matmul
+#pragma once
+
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
+#pragma message(__FILE__                                                    \
+                  " is deprecated and will be removed in a future release." \
+                  " Please use the cuh version instead.")
+#endif
 
-}  // end namespace linalg
-}  // end namespace sparse
-}  // end namespace raft
+#include <raft/sparse/linalg/masked_matmul.cuh>
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index c1a495ea3d..d74296a267 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -17,6 +17,7 @@
 #include "../test_utils.cuh"
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/bitset.cuh>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
@@ -370,7 +371,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
         raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_rows, params.n_cols, nnz);
       auto csr_view = csr.structure_view();
 
-      convert::bitmap_to_csr(handle, bitmap, csr);
+      bitmap.to_csr(handle, csr);
       raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
       raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
       raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
@@ -379,7 +380,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
         indptr_d.data(), indices_d.data(), params.n_rows, params.n_cols, nnz);
       auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
 
-      convert::bitmap_to_csr(handle, bitmap, csr);
+      bitmap.to_csr(handle, csr);
       raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
     }
     resource::sync_stream(handle);
@@ -477,5 +478,289 @@ INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
                         BitmapToCSRTestLOnLargeSize,
                         ::testing::ValuesIn(bitmaptocsr_large_inputs<int64_t>));
 
+/******************************** bitset to csr ********************************/
+
+template <typename index_t>
+struct BitsetToCSRInputs {
+  index_t n_repeat;  // number of CSR rows; the single-row bitset is repeated this many times
+  index_t n_cols;    // number of bits in the bitset == number of CSR columns
+  float sparsity;    // fraction of the n_cols bits that the test sets to 1
+  bool owning;       // true: CSR matrix allocates its own structure; false: built on a preset view
+};
+// Checks bitset.to_csr() against a CPU-built CSR of the bitset repeated n_repeat times.
+template <typename bitset_t, typename index_t, typename value_t>
+class BitsetToCSRTest : public ::testing::TestWithParam<BitsetToCSRInputs<index_t>> {
+ public:
+  BitsetToCSRTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<BitsetToCSRInputs<index_t>>::GetParam()),
+      bitset_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      values_d(0, stream),
+      indptr_expected_d(0, stream),
+      indices_expected_d(0, stream),
+      values_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  void repeat_cpu_bitset(std::vector<bitset_t>& input,
+                         size_t input_bits,
+                         size_t repeat,
+                         std::vector<bitset_t>& output)
+  {
+    const size_t output_bits  = input_bits * repeat;
+    const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);
+    // Clear the destination, then append `repeat` back-to-back copies of the first input_bits bits.
+    std::memset(output.data(), 0, output_units * sizeof(bitset_t));
+
+    size_t output_bit_index = 0;
+
+    for (size_t r = 0; r < repeat; ++r) {
+      for (size_t i = 0; i < input_bits; ++i) {
+        size_t input_unit_index = i / (sizeof(bitset_t) * 8);
+        size_t input_bit_offset = i % (sizeof(bitset_t) * 8);
+        bool bit                = (input[input_unit_index] >> input_bit_offset) & 1;
+
+        size_t output_unit_index = output_bit_index / (sizeof(bitset_t) * 8);
+        size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8);
+
+        output[output_unit_index] |= (static_cast<bitset_t>(bit) << output_bit_offset);
+
+        ++output_bit_index;
+      }
+    }
+  }
+
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitset_t>& bitset)
+  {
+    index_t total    = static_cast<index_t>(m * n);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t res      = num_ones;
+    // Clear the buffer, then set num_ones distinct random bit positions; that count is returned.
+    for (auto& item : bitset) {
+      item = static_cast<bitset_t>(0);
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis(0, total > 0 ? total - 1 : 0);  // requires a <= b
+
+    while (num_ones > 0) {
+      index_t index = dis(gen);
+
+      bitset_t& element    = bitset[index / (8 * sizeof(bitset_t))];
+      index_t bit_position = index % (8 * sizeof(bitset_t));
+      // Shift a bitset_t one: a signed index_t 1 shifted into the word's top bit would overflow.
+      if (((element >> bit_position) & 1) == 0) {
+        element |= (static_cast<bitset_t>(1) << bit_position);
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void cpu_convert_to_csr(std::vector<bitset_t>& bitset,
+                          index_t rows,
+                          index_t cols,
+                          std::vector<index_t>& indices,
+                          std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+    // Row-major scan of a rows x cols bit matrix: record each set bit's column, then close the row.
+    index_t index        = 0;
+    bitset_t element     = 0;
+    index_t bit_position = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        index        = i * cols + j;
+        element      = bitset[index / (8 * sizeof(bitset_t))];
+        bit_position = index % (8 * sizeof(bitset_t));
+
+        if (((element >> bit_position) & 1)) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  bool csr_compare(const std::vector<index_t>& row_ptrs1,
+                   const std::vector<index_t>& col_indices1,
+                   const std::vector<index_t>& row_ptrs2,
+                   const std::vector<index_t>& col_indices2)
+  {
+    if (row_ptrs1.size() != row_ptrs2.size()) { return false; }
+
+    if (col_indices1.size() != col_indices2.size()) { return false; }
+
+    if (!std::equal(row_ptrs1.begin(), row_ptrs1.end(), row_ptrs2.begin())) { return false; }
+    // Column order inside a row is not significant: compare each row's indices after sorting.
+    for (size_t i = 0; i < row_ptrs1.size() - 1; ++i) {
+      size_t start_idx = row_ptrs1[i];
+      size_t end_idx   = row_ptrs1[i + 1];
+
+      std::vector<index_t> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
+      std::vector<index_t> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
+
+      std::sort(cols1.begin(), cols1.end());
+      std::sort(cols2.begin(), cols2.end());
+
+      if (cols1 != cols2) { return false; }
+    }
+
+    return true;
+  }
+
+  void SetUp() override
+  {
+    index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8));
+    std::vector<bitset_t> bitset_h(element);
+    std::vector<bitset_t> bitset_repeat_h(element * params.n_repeat);
+    // Host reference: one random bitset row, replicated n_repeat times, converted to CSR on the CPU.
+    nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h);
+
+    repeat_cpu_bitset(bitset_h, size_t(params.n_cols), size_t(params.n_repeat), bitset_repeat_h);
+    nnz *= params.n_repeat;
+
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_repeat + 1);
+
+    cpu_convert_to_csr(bitset_repeat_h, params.n_repeat, params.n_cols, indices_h, indptr_h);
+
+    bitset_d.resize(bitset_h.size(), stream);
+    indptr_d.resize(params.n_repeat + 1, stream);
+    indices_d.resize(nnz, stream);
+
+    indptr_expected_d.resize(params.n_repeat + 1, stream);
+    indices_expected_d.resize(nnz, stream);
+    values_expected_d.resize(nnz, stream);
+
+    thrust::fill_n(resource::get_thrust_policy(handle), values_expected_d.data(), nnz, value_t{1});
+
+    values_d.resize(nnz, stream);
+
+    update_device(indices_expected_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_expected_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  void Run()
+  {
+    auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), params.n_cols);
+    // Owning: results are copied out of the matrix; non-owning: the view wraps indptr_d/indices_d.
+    if (params.owning) {
+      auto csr =
+        raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_repeat, params.n_cols, nnz);
+      auto csr_view = csr.structure_view();
+
+      bitset.to_csr(handle, csr);
+      raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
+      raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
+      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+    } else {
+      auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+        indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
+      auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
+
+      bitset.to_csr(handle, csr);
+      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+    }
+    resource::sync_stream(handle);
+
+    std::vector<index_t> indices_h(indices_expected_d.size(), 0);
+    std::vector<index_t> indices_expected_h(indices_expected_d.size(), 0);
+    update_host(indices_h.data(), indices_d.data(), indices_h.size(), stream);
+    update_host(indices_expected_h.data(), indices_expected_d.data(), indices_h.size(), stream);
+
+    std::vector<index_t> indptr_h(indptr_expected_d.size(), 0);
+    std::vector<index_t> indptr_expected_h(indptr_expected_d.size(), 0);
+    update_host(indptr_h.data(), indptr_d.data(), indptr_h.size(), stream);
+    update_host(indptr_expected_h.data(), indptr_expected_d.data(), indptr_h.size(), stream);
+
+    resource::sync_stream(handle);
+
+    ASSERT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h));
+    ASSERT_TRUE(raft::devArrMatch<value_t>(
+      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  BitsetToCSRInputs<index_t> params;
+
+  rmm::device_uvector<bitset_t> bitset_d;
+
+  index_t nnz;  // total expected non-zeros (per-row count * n_repeat); assigned in SetUp()
+
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<value_t> values_d;
+
+  rmm::device_uvector<index_t> indptr_expected_d;
+  rmm::device_uvector<index_t> indices_expected_d;
+  rmm::device_uvector<value_t> values_expected_d;
+};
+
+using BitsetToCSRTestI = BitsetToCSRTest<uint32_t, int, float>;
+TEST_P(BitsetToCSRTestI, Result) { Run(); }
+// Same fixture with 64-bit indices.
+using BitsetToCSRTestL = BitsetToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitsetToCSRTestL, Result) { Run(); }
+// Same instantiation as BitsetToCSRTestL; it is registered below with the large input set.
+using BitsetToCSRTestLOnLargeSize = BitsetToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitsetToCSRTestLOnLargeSize, Result) { Run(); }
+// Each entry below is {n_repeat, n_cols, sparsity, owning}.
+template <typename index_t>
+const std::vector<BitsetToCSRInputs<index_t>> bitsettocsr_inputs = {
+  {0, 0, 0.2, false},
+  {10, 32, 0.4, false},
+  {10, 3, 0.2, false},
+  {32, 1024, 0.4, false},
+  {1024, 1048576, 0.01, false},
+  {1024, 1024, 0.4, false},
+  {64 * 1024 + 10, 2, 0.3, false},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.3, false},             // No peeling-remainder
+  {17, 16, 0.3, false},             // Check peeling-remainder
+  {18, 16, 0.3, false},             // Check peeling-remainder
+  {32 + 9, 33, 0.2, false},         // Check peeling-remainder
+  {2, 33, 0.2, false},              // Check peeling-remainder
+  {0, 0, 0.2, true},
+  {10, 32, 0.4, true},
+  {10, 3, 0.2, true},
+  {32, 1024, 0.4, true},
+  {1024, 1048576, 0.01, true},
+  {1024, 1024, 0.4, true},
+  {64 * 1024 + 10, 2, 0.3, true},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.3, true},             // No peeling-remainder
+  {17, 16, 0.3, true},             // Check peeling-remainder
+  {18, 16, 0.3, true},             // Check peeling-remainder
+  {32 + 9, 33, 0.2, true},         // Check peeling-remainder
+  {2, 33, 0.2, true},              // Check peeling-remainder
+};
+// Large cases: 100 rows over bitsets of about 1e8 bits (one with a length that is not a round size).
+template <typename index_t>
+const std::vector<BitsetToCSRInputs<index_t>> bitsettocsr_large_inputs = {
+  {100, 100000000, 0.01, true}, {100, 100000000, 0.05, false}, {100, 100000000 + 17, 0.05, false}};
+// All three suites are registered under the SparseConvertCSRTest instantiation prefix.
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitsetToCSRTestI,
+                        ::testing::ValuesIn(bitsettocsr_inputs<int>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitsetToCSRTestL,
+                        ::testing::ValuesIn(bitsettocsr_inputs<int64_t>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitsetToCSRTestLOnLargeSize,
+                        ::testing::ValuesIn(bitsettocsr_large_inputs<int64_t>));
+
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/masked_matmul.cu b/cpp/test/sparse/masked_matmul.cu
index f883beae32..5ee1677015 100644
--- a/cpp/test/sparse/masked_matmul.cu
+++ b/cpp/test/sparse/masked_matmul.cu
@@ -19,7 +19,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/random/make_blobs.cuh>
-#include <raft/sparse/linalg/masked_matmul.hpp>
+#include <raft/sparse/linalg/masked_matmul.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <thrust/reduce.h>
@@ -46,6 +46,8 @@ struct MaskedMatmulInputs {
   unsigned long long int seed;
 };
 
+enum class BitsLayout { Bitset, Bitmap };
+
 template <typename value_t>
 struct sum_abs_op {
   __host__ __device__ value_t operator()(const value_t& x, const value_t& y) const
@@ -87,7 +89,8 @@ bool isCuSparseVersionGreaterThan_12_0_1()
 template <typename value_t,
           typename output_t,
           typename index_t,
-          typename bitmap_t      = uint32_t,
+          BitsLayout bits_layout = BitsLayout::Bitmap,
+          typename bits_t        = uint32_t,
           typename LayoutPolicyA = raft::row_major,
           typename LayoutPolicyB = raft::row_major>
 class MaskedMatmulTest
@@ -98,7 +101,7 @@ class MaskedMatmulTest
       stream(resource::get_cuda_stream(handle)),
       a_data_d(0, resource::get_cuda_stream(handle)),
       b_data_d(0, resource::get_cuda_stream(handle)),
-      bitmap_d(0, resource::get_cuda_stream(handle)),
+      bits_d(0, resource::get_cuda_stream(handle)),
       c_indptr_d(0, resource::get_cuda_stream(handle)),
       c_indices_d(0, resource::get_cuda_stream(handle)),
       c_data_d(0, resource::get_cuda_stream(handle)),
@@ -107,14 +110,14 @@ class MaskedMatmulTest
   }
 
  protected:
-  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bits_t>& bits)
   {
     index_t total    = static_cast<index_t>(m * n);
     index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
     index_t res      = num_ones;
 
-    for (auto& item : bitmap) {
-      item = static_cast<bitmap_t>(0);
+    for (auto& item : bits) {
+      item = static_cast<bits_t>(0);
     }
 
     std::random_device rd;
@@ -124,8 +127,8 @@ class MaskedMatmulTest
     while (num_ones > 0) {
       index_t index = dis(gen);
 
-      bitmap_t& element    = bitmap[index / (8 * sizeof(bitmap_t))];
-      index_t bit_position = index % (8 * sizeof(bitmap_t));
+      bits_t& element      = bits[index / (8 * sizeof(bits_t))];
+      index_t bit_position = index % (8 * sizeof(bits_t));
 
       if (((element >> bit_position) & 1) == 0) {
         element |= (static_cast<index_t>(1) << bit_position);
@@ -135,7 +138,27 @@ class MaskedMatmulTest
     return res;
   }
 
-  void cpu_convert_to_csr(std::vector<bitmap_t>& bitmap,
+  // Appends `repeat` copies of the first `input_bits` bits of `inout` directly after them.
+  // Bits are OR-ed into place and the vector is never resized here, so `inout` must already
+  // have room for (repeat + 1) * input_bits bits.
+  void repeat_cpu_bitset_inplace(std::vector<bits_t>& inout, size_t input_bits, size_t repeat)
+  {
+    constexpr size_t bits_per_unit = sizeof(bits_t) * 8;
+    const size_t copied_bits       = repeat * input_bits;
+
+    // The k-th copied bit reads source bit (k mod input_bits) and lands at input_bits + k.
+    for (size_t k = 0; k < copied_bits; ++k) {
+      const size_t src = k % input_bits;
+      const size_t dst = input_bits + k;
+
+      const bool bit_is_set = (inout[src / bits_per_unit] >> (src % bits_per_unit)) & 1;
+      bits_t& target        = inout[dst / bits_per_unit];
+
+      target |= (static_cast<bits_t>(bit_is_set) << (dst % bits_per_unit));
+    }
+  }
+
+  void cpu_convert_to_csr(std::vector<bits_t>& bits,
                           index_t rows,
                           index_t cols,
                           std::vector<index_t>& indices,
@@ -146,14 +169,14 @@ class MaskedMatmulTest
     indptr[offset_indptr++] = 0;
 
     index_t index        = 0;
-    bitmap_t element     = 0;
+    bits_t element       = 0;
     index_t bit_position = 0;
 
     for (index_t i = 0; i < rows; ++i) {
       for (index_t j = 0; j < cols; ++j) {
         index        = i * cols + j;
-        element      = bitmap[index / (8 * sizeof(bitmap_t))];
-        bit_position = index % (8 * sizeof(bitmap_t));
+        element      = bits[index / (8 * sizeof(bits_t))];
+        bit_position = index % (8 * sizeof(bits_t));
 
         if (((element >> bit_position) & 1)) {
           indices[offset_values] = static_cast<index_t>(j);
@@ -201,15 +224,17 @@ class MaskedMatmulTest
     index_t b_size = params.k * params.n;
     index_t c_size = params.m * params.n;
 
-    index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bitmap_t) * 8));
-    std::vector<bitmap_t> bitmap_h(element);
+    index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bits_t) * 8));
+    std::vector<bits_t> bits_h(element);
+
+    std::memset(bits_h.data(), 0, bits_h.size() * sizeof(bits_t));
 
     std::vector<value_t> a_data_h(a_size);
     std::vector<value_t> b_data_h(b_size);
 
     a_data_d.resize(a_size, stream);
     b_data_d.resize(b_size, stream);
-    bitmap_d.resize(bitmap_h.size(), stream);
+    bits_d.resize(bits_h.size(), stream);
 
     auto blobs_a_b = raft::make_device_matrix<output_t, index_t>(handle, 1, a_size + b_size);
     auto labels    = raft::make_device_vector<index_t, index_t>(handle, 1);
@@ -262,18 +287,27 @@ class MaskedMatmulTest
 
     resource::sync_stream(handle);
 
-    index_t c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h);
+    index_t c_true_nnz = 0;
+    if constexpr (bits_layout == BitsLayout::Bitmap) {
+      c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h);
+    } else if constexpr (bits_layout == BitsLayout::Bitset) {
+      c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h);
+      repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1);
+      c_true_nnz *= params.m;
+    } else {
+      GTEST_SKIP() << "Unsupported BitsLayout!";
+    }
 
     std::vector<index_t> c_indptr_h(params.m + 1);
     std::vector<index_t> c_indices_h(c_true_nnz);
     std::vector<output_t> c_data_h(c_true_nnz);
 
-    cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h);
+    cpu_convert_to_csr(bits_h, params.m, params.n, c_indices_h, c_indptr_h);
 
     c_data_d.resize(c_data_h.size(), stream);
 
     update_device(c_data_d.data(), c_data_h.data(), c_data_h.size(), stream);
-    update_device(bitmap_d.data(), bitmap_h.data(), bitmap_h.size(), stream);
+    update_device(bits_d.data(), bits_h.data(), bits_h.size(), stream);
     resource::sync_stream(handle);
 
     cpu_sddmm(a_data_h, b_data_h, c_data_h, c_indices_h, c_indptr_h, true, true);
@@ -304,9 +338,6 @@ class MaskedMatmulTest
     auto B =
       raft::make_device_matrix_view<const value_t, index_t>(b_data_d.data(), params.n, params.k);
 
-    auto mask =
-      raft::core::bitmap_view<const bitmap_t, index_t>(bitmap_d.data(), params.m, params.n);
-
     auto c_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
       c_indptr_d.data(),
       c_indices_d.data(),
@@ -316,7 +347,15 @@ class MaskedMatmulTest
 
     auto C = raft::make_device_csr_matrix_view<output_t>(c_data_d.data(), c_structure);
 
-    raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    if constexpr (bits_layout == BitsLayout::Bitmap) {
+      auto mask = raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    } else if constexpr (bits_layout == BitsLayout::Bitset) {
+      auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    } else {
+      GTEST_SKIP() << "Unsupported BitsLayout!";
+    }
 
     resource::sync_stream(handle);
 
@@ -344,7 +383,7 @@ class MaskedMatmulTest
 
   rmm::device_uvector<value_t> a_data_d;
   rmm::device_uvector<value_t> b_data_d;
-  rmm::device_uvector<bitmap_t> bitmap_d;
+  rmm::device_uvector<bits_t> bits_d;
 
   rmm::device_uvector<index_t> c_indptr_d;
   rmm::device_uvector<index_t> c_indices_d;
@@ -353,14 +392,23 @@ class MaskedMatmulTest
   rmm::device_uvector<output_t> c_expected_data_d;
 };
 
-using MaskedMatmulTestF = MaskedMatmulTest<float, float, int>;
-TEST_P(MaskedMatmulTestF, Result) { Run(); }
+using MaskedMatmulOnBitmapTestF = MaskedMatmulTest<float, float, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestF, Result) { Run(); }
+
+using MaskedMatmulOnBitmapTestD = MaskedMatmulTest<double, double, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestD, Result) { Run(); }
 
-using MaskedMatmulTestD = MaskedMatmulTest<double, double, int>;
-TEST_P(MaskedMatmulTestD, Result) { Run(); }
+using MaskedMatmulOnBitmapTestH = MaskedMatmulTest<half, float, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestH, Result) { Run(); }
 
-using MaskedMatmulTestH = MaskedMatmulTest<half, float, int>;
-TEST_P(MaskedMatmulTestH, Result) { Run(); }
+using MaskedMatmulOnBitsetTestF = MaskedMatmulTest<float, float, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestF, Result) { Run(); }
+
+using MaskedMatmulOnBitsetTestD = MaskedMatmulTest<double, double, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestD, Result) { Run(); }
+
+using MaskedMatmulOnBitsetTestH = MaskedMatmulTest<half, float, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestH, Result) { Run(); }
 
 const std::vector<MaskedMatmulInputs<float, float, int>> sddmm_inputs_f = {
   {0.001f, 2, 255, 1023, 0.19, 1234ULL},
@@ -419,11 +467,29 @@ const std::vector<MaskedMatmulInputs<half, float, int>> sddmm_inputs_h = {
   {0.0003f, 31, 1025, 1025, 0.19, 1234ULL},
   {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}};
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestF,
+                        ::testing::ValuesIn(sddmm_inputs_f));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestD,
+                        ::testing::ValuesIn(sddmm_inputs_d));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestH,
+                        ::testing::ValuesIn(sddmm_inputs_h));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestF,
+                        ::testing::ValuesIn(sddmm_inputs_f));
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestD,
+                        ::testing::ValuesIn(sddmm_inputs_d));
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestH, ::testing::ValuesIn(sddmm_inputs_h));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestH,
+                        ::testing::ValuesIn(sddmm_inputs_h));
 
 }  // namespace sparse
 }  // namespace raft

From 8ea0e7e71ad4360b99d3a45aa0b2f124cf01abd1 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 16 Jan 2025 17:22:38 +0100
Subject: [PATCH 21/37] Fix broken link to python doc (#2537)

Apply the same change as https://github.com/rapidsai/cuml/pull/6202 to fix Python links to source code

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2537
---
 docs/source/conf.py                  |  2 +-
 docs/source/sphinxext/github_link.py | 24 +++++++++++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7a287b689f..e5e6e0871a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -208,7 +208,7 @@ def setup(app):
 linkcode_resolve = make_linkcode_resolve(
     "pylibraft",
     "https://github.com/rapidsai/raft"
-    "raft/blob/{revision}/python/pylibraft"
+    "/blob/{revision}/python/pylibraft/"
     "{package}/{path}#L{lineno}",
 )
 
diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py
index a7a46fdd9d..5712bbe5cb 100644
--- a/docs/source/sphinxext/github_link.py
+++ b/docs/source/sphinxext/github_link.py
@@ -1,5 +1,20 @@
 # This contains code with copyright by the scikit-learn project, subject to the
 # license in /thirdparty/LICENSES/LICENSE.scikit_learn
+#
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 import inspect
 import os
@@ -96,15 +111,14 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision):
             # fn is expected to be the absolute path.
             fn = os.path.relpath(source_file, start=package)
             print("{}:{}".format(
-                os.path.abspath(os.path.join("..", "python", "cuml", fn)),
+                os.path.abspath(os.path.join("..", "python", "pylibraft", fn)),
                 lineno))
         else:
             return
     else:
-        # Test if we are absolute or not (pyx are relative)
-        if (not os.path.isabs(fn)):
-            # Should be relative to docs right now
-            fn = os.path.abspath(os.path.join("..", "python", fn))
+        if fn.endswith(".pyx"):
+            sp_path = next(x for x in sys.path if re.match(".*site-packages$", x))
+            fn = fn.replace("/opt/conda/conda-bld/work/python/pylibraft", sp_path)
 
         # Convert to relative from module root
         fn = os.path.relpath(fn,

From fb6bfe6ee956a5e40295300d453f1261ece3cedf Mon Sep 17 00:00:00 2001
From: Victor Lafargue <viclafargue@nvidia.com>
Date: Thu, 16 Jan 2025 19:19:59 +0100
Subject: [PATCH 22/37] Introduction of the `raft::device_resources_snmg` type
 (#2487)

Introduces the `raft::device_resources_snmg` type to hold all resources required for the NCCL clique.

~Answers https://github.com/rapidsai/raft/issues/2459~
Removed call to `raft::comms::build_comms_nccl_only` (https://github.com/rapidsai/raft/issues/2465)

Authors:
  - Victor Lafargue (https://github.com/viclafargue)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2487
---
 cpp/include/raft/comms/nccl_clique.hpp        | 156 -------------
 .../raft/core/device_resources_snmg.hpp       | 217 ++++++++++++++++++
 .../raft/core/resource/nccl_clique.hpp        |  66 ------
 cpp/include/raft/core/resources.hpp           |   3 +-
 docs/source/cpp_api/core_resources.rst        |  17 ++
 5 files changed, 236 insertions(+), 223 deletions(-)
 delete mode 100644 cpp/include/raft/comms/nccl_clique.hpp
 create mode 100644 cpp/include/raft/core/device_resources_snmg.hpp
 delete mode 100644 cpp/include/raft/core/resource/nccl_clique.hpp

diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp
deleted file mode 100644
index c6520af753..0000000000
--- a/cpp/include/raft/comms/nccl_clique.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/device_resources.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <nccl.h>
-
-/**
- * @brief Error checking macro for NCCL runtime API functions.
- *
- * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
- * exception detailing the NCCL error that occurred
- */
-#define RAFT_NCCL_TRY(call)                        \
-  do {                                             \
-    ncclResult_t const status = (call);            \
-    if (ncclSuccess != status) {                   \
-      std::string msg{};                           \
-      SET_ERROR_MSG(msg,                           \
-                    "NCCL error encountered at: ", \
-                    "call='%s', Reason=%d:%s",     \
-                    #call,                         \
-                    status,                        \
-                    ncclGetErrorString(status));   \
-      throw raft::logic_error(msg);                \
-    }                                              \
-  } while (0);
-
-namespace raft::comms {
-void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank);
-}
-
-namespace raft::comms {
-
-struct nccl_clique {
-  using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-
-  /**
-   * Instantiates a NCCL clique with all available GPUs
-   *
-   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool
-   *
-   */
-  nccl_clique(int percent_of_free_memory = 80)
-    : root_rank_(0),
-      percent_of_free_memory_(percent_of_free_memory),
-      per_device_pools_(0),
-      device_resources_(0)
-  {
-    cudaGetDeviceCount(&num_ranks_);
-    device_ids_.resize(num_ranks_);
-    std::iota(device_ids_.begin(), device_ids_.end(), 0);
-    nccl_comms_.resize(num_ranks_);
-    nccl_clique_init();
-  }
-
-  /**
-   * Instantiates a NCCL clique
-   *
-   * Usage example:
-   * @code{.cpp}
-   * int n_devices;
-   * cudaGetDeviceCount(&n_devices);
-   * std::vector<int> device_ids(n_devices);
-   * std::iota(device_ids.begin(), device_ids.end(), 0);
-   * cuvs::neighbors::mg::nccl_clique& clique(device_ids); // first device is the root rank
-   * @endcode
-   *
-   * @param[in] device_ids list of device IDs to be used to initiate the clique
-   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool
-   *
-   */
-  nccl_clique(const std::vector<int>& device_ids, int percent_of_free_memory = 80)
-    : root_rank_(0),
-      num_ranks_(device_ids.size()),
-      percent_of_free_memory_(percent_of_free_memory),
-      device_ids_(device_ids),
-      nccl_comms_(device_ids.size()),
-      per_device_pools_(0),
-      device_resources_(0)
-  {
-    nccl_clique_init();
-  }
-
-  void nccl_clique_init()
-  {
-    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data()));
-
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
-
-      // create a pool memory resource for each device
-      auto old_mr = rmm::mr::get_current_device_resource();
-      per_device_pools_.push_back(std::make_unique<pool_mr>(
-        old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_)));
-      rmm::cuda_device_id id(device_ids_[rank]);
-      rmm::mr::set_per_device_resource(id, per_device_pools_.back().get());
-
-      // create a device resource handle for each device
-      device_resources_.emplace_back();
-
-      // add NCCL communications to the device resource handle
-      raft::comms::build_comms_nccl_only(
-        &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank);
-    }
-
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
-      raft::resource::sync_stream(device_resources_[rank]);
-    }
-  }
-
-  const raft::device_resources& set_current_device_to_root_rank() const
-  {
-    int root_device_id = device_ids_[root_rank_];
-    RAFT_CUDA_TRY(cudaSetDevice(root_device_id));
-    return device_resources_[root_rank_];
-  }
-
-  ~nccl_clique()
-  {
-#pragma omp parallel for  // necessary to avoid hangs
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      cudaSetDevice(device_ids_[rank]);
-      ncclCommDestroy(nccl_comms_[rank]);
-      rmm::cuda_device_id id(device_ids_[rank]);
-      rmm::mr::set_per_device_resource(id, nullptr);
-    }
-  }
-
-  int root_rank_;
-  int num_ranks_;
-  int percent_of_free_memory_;
-  std::vector<int> device_ids_;
-  std::vector<ncclComm_t> nccl_comms_;
-  std::vector<std::shared_ptr<pool_mr>> per_device_pools_;
-  std::vector<raft::device_resources> device_resources_;
-};
-
-}  // namespace raft::comms
diff --git a/cpp/include/raft/core/device_resources_snmg.hpp b/cpp/include/raft/core/device_resources_snmg.hpp
new file mode 100644
index 0000000000..f20a81a1c6
--- /dev/null
+++ b/cpp/include/raft/core/device_resources_snmg.hpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_resources.hpp>
+
+#include <nccl.h>
+#include <omp.h>
+
+#include <memory>
+#include <vector>
+
+/**
+ * @brief Error checking macro for NCCL runtime API functions.
+ *
+ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
+ * exception detailing the NCCL error that occurred
+ */
+#define RAFT_NCCL_TRY(call)                        \
+  do {                                             \
+    ncclResult_t const status = (call);            \
+    if (ncclSuccess != status) {                   \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "NCCL error encountered at: ", \
+                    "call='%s', Reason=%d:%s",     \
+                    #call,                         \
+                    status,                        \
+                    ncclGetErrorString(status));   \
+      throw raft::logic_error(msg);                \
+    }                                              \
+  } while (0);
+
+namespace raft {
+
+/**
+ * @brief SNMG (single-node multi-GPU) resource container object that stores a NCCL clique and all
+ * necessary resources used for calling device functions, cuda kernels, libraries and/or NCCL
+ * communications on each GPU. Note the `device_resources_snmg` object can also be used as a classic
+ * `device_resources` object. The associated resources will be the ones of the GPU used during
+ * object instantiation and a GPU switch operation will be ordered during the retrieval of said
+ * resources.
+ *
+ * The `device_resources_snmg` class is intended to be used in a single process to manage several
+ * GPUs. Please note that NCCL communications are the responsibility of the user. Blocking NCCL
+ * calls will sometimes require the use of several threads to avoid hangs.
+ */
+class device_resources_snmg : public device_resources {
+ public:
+  /**
+   * @brief Construct a SNMG resources instance with all available GPUs (device IDs 0..N-1)
+   */
+  device_resources_snmg() : device_resources(), root_rank_(0)
+  {
+    RAFT_CUDA_TRY(cudaGetDevice(&main_gpu_id_));  // device restored after initialization
+
+    int num_ranks = 0;
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_ranks));
+    device_ids_.resize(num_ranks);
+    std::iota(device_ids_.begin(), device_ids_.end(), 0);
+    nccl_comms_.resize(num_ranks);
+    initialize();
+  }
+
+  /**
+   * @brief Construct a SNMG resources instance with a subset of available GPUs
+   *
+   * @param[in] device_ids List of device IDs to be used by the NCCL clique (rank i <-> device_ids[i])
+   */
+  device_resources_snmg(const std::vector<int>& device_ids)
+    : device_resources(), root_rank_(0), device_ids_(device_ids), nccl_comms_(device_ids.size())
+  {
+    RAFT_CUDA_TRY(cudaGetDevice(&main_gpu_id_));  // device restored after initialization
+
+    initialize();
+  }
+
+  /**
+   * @brief SNMG resources instance copy constructor
+   *
+   * @param[in] clique A SNMG resources instance
+   */
+  device_resources_snmg(const device_resources_snmg& clique)
+    : device_resources(clique),
+      root_rank_(clique.root_rank_),
+      main_gpu_id_(clique.main_gpu_id_),
+      device_ids_(clique.device_ids_),
+      nccl_comms_(clique.nccl_comms_),
+      device_resources_(clique.device_resources_)
+  {
+  }
+
+  device_resources_snmg(device_resources_snmg&&)            = delete;
+  device_resources_snmg& operator=(device_resources_snmg&&) = delete;
+
+  /**
+   * @brief Set root rank of NCCL clique and return the newly set root rank
+   */
+  inline int set_root_rank(int rank) { return this->root_rank_ = rank; }
+
+  /**
+   * @brief Get root rank of NCCL clique
+   */
+  inline int get_root_rank() const { return this->root_rank_; }
+
+  /**
+   * @brief Get number of ranks in NCCL clique
+   */
+  inline int get_num_ranks() const { return this->device_ids_.size(); }
+
+  /**
+   * @brief Get device ID of rank in NCCL clique
+   */
+  inline int get_device_id(int rank) const { return this->device_ids_[rank]; }
+
+  /**
+   * @brief Get NCCL comm object of rank in NCCL clique
+   */
+  inline ncclComm_t get_nccl_comm(int rank) const { return this->nccl_comms_[rank]; }
+
+  /**
+   * @brief Get raft::device_resources object of rank in NCCL clique
+   */
+  inline const raft::device_resources& get_device_resources(int rank) const
+  {
+    return this->device_resources_[rank];
+  }
+
+  /**
+   * @brief Set current device ID to root rank and return its raft::device_resources object
+   */
+  inline const raft::device_resources& set_current_device_to_root_rank() const
+  {
+    return set_current_device_to_rank(get_root_rank());
+  }
+
+  /**
+   * @brief Set current device ID to rank and return its raft::device_resources object
+   */
+  inline const raft::device_resources& set_current_device_to_rank(int rank) const
+  {
+    RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+    return get_device_resources(rank);
+  }
+
+  /**
+   * @brief Set a memory pool on all GPUs of the clique
+   */
+  void set_memory_pool(int percent_of_free_memory) const
+  {
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      size_t limit =
+        rmm::percent_of_free_device_memory(percent_of_free_memory);  // check limit for each device
+      raft::resource::set_workspace_to_pool_resource(get_device_resources(rank), limit);
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+  bool has_resource_factory(resource::resource_type resource_type) const override
+  {
+    cudaSetDevice(this->main_gpu_id_);
+    return raft::resources::has_resource_factory(resource_type);
+  }
+
+  /** Destroys all held-up resources */
+  ~device_resources_snmg()
+  {
+#pragma omp parallel for  // necessary to avoid hangs
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      RAFT_NCCL_TRY(ncclCommDestroy(get_nccl_comm(rank)));
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+ private:
+  /**
+   * @brief Initializes the NCCL clique and raft::device_resources objects
+   */
+  void initialize()
+  {
+    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), get_num_ranks(), device_ids_.data()));
+
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      device_resources_.emplace_back();
+
+      // ideally add the ncclComm_t to the device_resources object with
+      // raft::comms::build_comms_nccl_only
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+  int root_rank_;
+  int main_gpu_id_;
+  std::vector<int> device_ids_;
+  std::vector<ncclComm_t> nccl_comms_;
+  std::vector<raft::device_resources> device_resources_;
+
+};  // class device_resources_snmg
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp
deleted file mode 100644
index edda5043ae..0000000000
--- a/cpp/include/raft/core/resource/nccl_clique.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/comms/nccl_clique.hpp>
-#include <raft/core/resource/resource_types.hpp>
-#include <raft/core/resources.hpp>
-
-#include <memory>
-
-namespace raft::resource {
-
-class nccl_clique_resource : public resource {
- public:
-  nccl_clique_resource() : clique_(std::make_unique<raft::comms::nccl_clique>()) {}
-  ~nccl_clique_resource() override {}
-  void* get_resource() override { return clique_.get(); }
-
- private:
-  std::unique_ptr<raft::comms::nccl_clique> clique_;
-};
-
-/** Factory that knows how to construct a specific raft::resource to populate the res_t. */
-class nccl_clique_resource_factory : public resource_factory {
- public:
-  resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; }
-  resource* make_resource() override { return new nccl_clique_resource(); }
-};
-
-/**
- * @defgroup nccl_clique_resource resource functions
- * @{
- */
-
-/**
- * Retrieves a NCCL clique from raft res if it exists, otherwise initializes it and return it.
- *
- * @param[in] res the raft resources object
- * @return NCCL clique
- */
-inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res)
-{
-  if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) {
-    res.add_resource_factory(std::make_shared<nccl_clique_resource_factory>());
-  }
-  return *res.get_resource<raft::comms::nccl_clique>(resource_type::NCCL_CLIQUE);
-};
-
-/**
- * @}
- */
-
-}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
index b0827d8e11..44525edb23 100644
--- a/cpp/include/raft/core/resources.hpp
+++ b/cpp/include/raft/core/resources.hpp
@@ -72,6 +72,7 @@ class resources {
   resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {}
   resources(resources&&)            = delete;
   resources& operator=(resources&&) = delete;
+  virtual ~resources() {}
 
   /**
    * @brief Returns true if a resource_factory has been registered for the
@@ -79,7 +80,7 @@ class resources {
    * @param resource_type resource type to check
    * @return true if resource_factory is registered for the given resource_type
    */
-  bool has_resource_factory(resource::resource_type resource_type) const
+  virtual bool has_resource_factory(resource::resource_type resource_type) const
   {
     std::lock_guard<std::mutex> _(mutex_);
     return factories_.at(resource_type).first != resource::resource_type::LAST_KEY;
diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst
index 0da11acae6..3c242af848 100644
--- a/docs/source/cpp_api/core_resources.rst
+++ b/docs/source/cpp_api/core_resources.rst
@@ -55,6 +55,23 @@ namespace *raft::core*
     :project: RAFT
     :members:
 
+SNMG Device Resources
+---------------------
+
+The `raft::device_resources_snmg` class provides a convenient way to design SNMG
+(single-node multi-GPU) algorithms. It initializes device-related resources
+for a set of devices forming a clique, including NCCL communications.
+GPUs can be addressed, and exchanges between them made, from multiple threads
+for performance or convenience.
+
+``#include <raft/core/device_resources_snmg.hpp>``
+
+namespace *raft::core*
+
+.. doxygenclass:: raft::device_resources_snmg
+    :project: RAFT
+    :members:
+
 Resource Functions
 ------------------
 

From 8299f17621a26802714f5cb219f020c2783c8b6d Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 16 Jan 2025 17:15:41 -0600
Subject: [PATCH 23/37] introduce libraft wheels (#2531)

Replaces #2306, contributes to
https://github.com/rapidsai/build-planning/issues/33.

Proposes packaging `libraft` as a wheel, which is then re-used by:

* `pylibraft-cu{11,12}` and `raft-cu{11,12}` (this PR)
* `libcugraph-cu{11,12}`, `pylibcugraph-cu{11,12}`, and
`cugraph-cu{11,12}` in https://github.com/rapidsai/cugraph/pull/4804
* `libcuml-cu{11,12}` and `cuml-cu{11,12}` in
https://github.com/rapidsai/cuml/pull/6199

As part of this, also proposes:

* introducing a new CMake option, `RAFT_COMPILE_DYNAMIC_ONLY`, to allow
building/installing only the dynamic shared library (i.e. skipping the
static library)
* enforcing `rapids-cmake`'s preferred CMake style
(https://github.com/rapidsai/raft/pull/2531#discussion_r1917039870)
* making wheel-building CI jobs always depend on other wheel-building CI
jobs, not tests or `*-publish` (to reduce end-to-end CI time)

## Notes for Reviewers

### Benefits of these changes

* smaller wheels (see "Size Changes" below)
* faster compile times (no more re-compiling RAFT in cuGraph and cuML
CI)
* other benefits mentioned in
https://github.com/rapidsai/build-planning/issues/33

### Wheel contents

`libraft`:

* `libraft.so` (shared library)
* RAFT headers
* vendored dependencies (`fmt`, CCCL, `cuco`, `cute`, `cutlass`)

`pylibraft`:

* `pylibraft` Python / Cython code and compiled Cython extensions

`raft-dask`:

* `raft-dask` Python / Cython code and compiled Cython extension

### Dependency Flows

In short: `libraft` contains a `libraft.so` dynamic library and the
headers to link against it.

* Anything that needs to link against RAFT at build time pulls in
`libraft` wheels as a build dependency.
* Anything that needs RAFT's symbols at runtime pulls it in as a runtime
dependency, and calls `libraft.load_library()`.

For more details and some flowcharts, see
https://github.com/rapidsai/build-planning/issues/33#issuecomment-2590129852

### Size changes (CUDA 12, Python 3.12, x86_64)

| wheel | num files (before) | num files (these PRs) | size (before) | size (these PRs) |
|:---------------:|------------------:|-----------------:|--------------:|-------------:|
| `libraft` | --- | 3169 | --- | 19M |
| `pylibraft` | 64 | 63 | 11M | 1M |
| `raft-dask` | 29 | 28 | 188M | 188M |
| `libcugraph` | --- | 1762 | --- | 903M |
| `pylibcugraph` | 190 | 187 | 901M | 2M |
| `cugraph` | 315 | 313 | 899M | 3.0M |
| `libcuml` | --- | 1766 | --- | 289M |
| `cuml` | 442 | --- | 517M | --- |
|**TOTAL** | **1,040** | **7,268** | **2,516M** | **1,405M** |

*NOTES: size = compressed, "before" = 2025-01-13 nightlies*

<details><summary>how I calculated those (click me)</summary>

* `cugraph`: nightly commit =
https://github.com/rapidsai/cugraph/commit/8507cbf63db2f349136b266d3e6e787b189f45a0,
PR = https://github.com/rapidsai/cugraph/pull/4804
* `cuml`: nightly commit =
https://github.com/rapidsai/cuml/commit/7c715c494dff71274d0fdec774bdee12a7e78827,
PR = https://github.com/rapidsai/cuml/pull/6199
* `raft`: nightly commit =
https://github.com/rapidsai/raft/commit/1b62c4117a35b11ce3c830daae248e32ebf75e3f,
PR = this PR

```shell
docker run \
    --rm \
    --network host \
    --env RAPIDS_NIGHTLY_DATE=2025-01-13 \
    --env CUGRAPH_NIGHTLY_SHA=8507cbf63db2f349136b266d3e6e787b189f45a0 \
    --env CUGRAPH_PR="pull-request/4804" \
    --env CUGRAPH_PR_SHA="2ef32eaa006a84c0bd16220bb8e8af34198fbee8" \
    --env CUML_NIGHTLY_SHA=7c715c494dff71274d0fdec774bdee12a7e78827 \
    --env CUML_PR="pull-request/6199" \
    --env CUML_PR_SHA="2ef32eaa006a84c0bd16220bb8e8af34198fbee8" \
    --env RAFT_NIGHTLY_SHA=1b62c4117a35b11ce3c830daae248e32ebf75e3f \
    --env RAFT_PR="pull-request/2531" \
    --env RAFT_PR_SHA="0d6597b08919f2aae8ac268f1a68d6a8fe5beb4e" \
    --env RAPIDS_PY_CUDA_SUFFIX=cu12 \
    --env WHEEL_DIR_BEFORE=/tmp/wheels-before \
    --env WHEEL_DIR_AFTER=/tmp/wheels-after \
    -it rapidsai/ci-wheel:cuda12.5.1-rockylinux8-py3.12 \
    bash

# --- nightly wheels --- #
mkdir -p ./wheels-before

export RAPIDS_BUILD_TYPE=branch
export RAPIDS_REF_NAME="branch-25.02"

# pylibraft
RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/raft \
RAPIDS_SHA=${RAFT_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# raft-dask
RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/raft \
RAPIDS_SHA=${RAFT_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# cugraph
RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cugraph \
RAPIDS_SHA=${CUGRAPH_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# pylibcugraph
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cugraph \
RAPIDS_SHA=${CUGRAPH_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# cuml
RAPIDS_PY_WHEEL_NAME="cuml_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuml \
RAPIDS_SHA=${CUML_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# --- wheels from CI --- #
mkdir -p ./wheels-after

export RAPIDS_BUILD_TYPE="pull-request"

# libraft
RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/raft \
RAPIDS_REF_NAME="${RAFT_PR}" \
RAPIDS_SHA="${RAFT_PR_SHA}" \
    rapids-download-wheels-from-s3 cpp ./wheels-after

# pylibraft
RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/raft \
RAPIDS_REF_NAME="${RAFT_PR}" \
RAPIDS_SHA="${RAFT_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

# raft-dask
RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/raft \
RAPIDS_REF_NAME="${RAFT_PR}" \
RAPIDS_SHA="${RAFT_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

# libcugraph
RAPIDS_PY_WHEEL_NAME="libcugraph_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cugraph \
RAPIDS_REF_NAME="${CUGRAPH_PR}" \
RAPIDS_SHA="${CUGRAPH_PR_SHA}" \
    rapids-download-wheels-from-s3 cpp ./wheels-after

# pylibcugraph
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cugraph \
RAPIDS_REF_NAME="${CUGRAPH_PR}" \
RAPIDS_SHA="${CUGRAPH_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

# cugraph
RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cugraph \
RAPIDS_REF_NAME="${CUGRAPH_PR}" \
RAPIDS_SHA="${CUGRAPH_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

# libcuml
RAPIDS_PY_WHEEL_NAME="libcuml_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuml \
RAPIDS_REF_NAME="${CUML_PR}" \
RAPIDS_SHA="${CUML_PR_SHA}" \
    rapids-download-wheels-from-s3 cpp ./wheels-after

# cuml
RAPIDS_PY_WHEEL_NAME="cuml_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuml \
RAPIDS_REF_NAME="${CUML_PR}" \
RAPIDS_SHA="${CUML_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

pip install pydistcheck
pydistcheck \
    --inspect \
    --select 'distro-too-large-compressed' \
    ./wheels-before/*.whl \
| grep -E '^checking|files: | compressed' \
> ./before.txt

# get more exact sizes
du -sh ./wheels-before/*

pydistcheck \
    --inspect \
    --select 'distro-too-large-compressed' \
    ./wheels-after/*.whl \
| grep -E '^checking|files: | compressed' \
> ./after.txt

# get more exact sizes
du -sh ./wheels-after/*
```

</details>

### How I tested this

These other PRs:

* https://github.com/rapidsai/devcontainers/pull/435
* https://github.com/rapidsai/cugraph-gnn/pull/110
* https://github.com/rapidsai/cuml/pull/6199
* https://github.com/rapidsai/cugraph/pull/4804
---
 .github/workflows/build.yaml                  |  26 ++++
 .github/workflows/pr.yaml                     |  17 ++-
 build.sh                                      |   7 +-
 ci/build_wheel.sh                             |   9 +-
 ci/build_wheel_libraft.sh                     |  43 +++++++
 ci/build_wheel_pylibraft.sh                   |  21 ++--
 ci/build_wheel_raft_dask.sh                   |  14 ++-
 ci/check_style.sh                             |   7 ++
 ci/release/update-version.sh                  |   2 +
 ci/test_wheel_pylibraft.sh                    |   8 +-
 ci/test_wheel_raft_dask.sh                    |   8 +-
 ci/validate_wheel.sh                          |  16 +--
 cpp/CMakeLists.txt                            |  73 +++++++----
 cpp/cmake/modules/ConfigureCUDA.cmake         |   4 +-
 dependencies.yaml                             |  97 +++++++++++++--
 python/libraft/CMakeLists.txt                 |  65 ++++++++++
 python/libraft/LICENSE                        |   1 +
 python/libraft/README.md                      |   1 +
 python/libraft/libraft/VERSION                |   1 +
 python/libraft/libraft/__init__.py            |  16 +++
 python/libraft/libraft/_version.py            |  33 +++++
 python/libraft/libraft/load.py                |  80 ++++++++++++
 python/libraft/pyproject.toml                 | 115 ++++++++++++++++++
 python/pylibraft/CMakeLists.txt               |  59 +--------
 python/pylibraft/pylibraft/__init__.py        |  11 ++
 .../pylibraft/pylibraft/common/CMakeLists.txt |   2 +-
 .../pylibraft/pylibraft/random/CMakeLists.txt |   2 +-
 .../pylibraft/sparse/linalg/CMakeLists.txt    |   2 +-
 python/pylibraft/pyproject.toml               |  13 +-
 python/raft-dask/CMakeLists.txt               |  32 +----
 .../raft-dask/cmake/thirdparty/get_ucxx.cmake |   4 +-
 python/raft-dask/pyproject.toml               |   3 +
 python/raft-dask/raft_dask/__init__.py        |  15 ++-
 .../raft-dask/raft_dask/common/CMakeLists.txt |   3 +-
 .../raft_dask/include_test/CMakeLists.txt     |   3 +-
 rapids_config.cmake                           |   6 +-
 36 files changed, 643 insertions(+), 176 deletions(-)
 create mode 100755 ci/build_wheel_libraft.sh
 create mode 100644 python/libraft/CMakeLists.txt
 create mode 120000 python/libraft/LICENSE
 create mode 120000 python/libraft/README.md
 create mode 120000 python/libraft/libraft/VERSION
 create mode 100644 python/libraft/libraft/__init__.py
 create mode 100644 python/libraft/libraft/_version.py
 create mode 100644 python/libraft/libraft/load.py
 create mode 100644 python/libraft/pyproject.toml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 5f80d8cfda..d484bcae22 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -66,7 +66,30 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libraft:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libraft.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-publish-libraft:
+    needs: wheel-build-libraft
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libraft
+      package-type: cpp
   wheel-build-pylibraft:
+    needs: wheel-build-libraft
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -85,7 +108,9 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: pylibraft
+      package-type: python
   wheel-build-raft-dask:
+    needs: wheel-build-libraft
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -104,3 +129,4 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: raft_dask
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a270df1dfa..9a51c783e9 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -21,6 +21,7 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
+      - wheel-build-libraft
       - wheel-build-pylibraft
       - wheel-tests-pylibraft
       - wheel-build-raft-dask
@@ -116,10 +117,22 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-pylibraft:
+  wheel-build-libraft:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: pull-request
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libraft.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-build-pylibraft:
+    needs: [checks, wheel-build-libraft]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
@@ -132,7 +145,7 @@ jobs:
       build_type: pull-request
       script: ci/test_wheel_pylibraft.sh
   wheel-build-raft-dask:
-    needs: wheel-tests-pylibraft
+    needs: [checks, wheel-build-libraft]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
diff --git a/build.sh b/build.sh
index a95cb8ee23..de3ebfa3c5 100755
--- a/build.sh
+++ b/build.sh
@@ -347,13 +347,8 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
-# Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option.
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON"
-fi
 # Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
-SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
+SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
 
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 326ee9a4c7..4c295c416e 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -5,6 +5,7 @@ set -euo pipefail
 
 package_name=$1
 package_dir=$2
+package_type=$3
 underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
 # Clear out system ucx files to ensure that we're getting ucx from the wheel.
@@ -39,6 +40,12 @@ case "${RAPIDS_CUDA_VERSION}" in
   ;;
 esac
 
+if [[ ${package_name} != "libraft" ]]; then
+    EXCLUDE_ARGS+=(
+      --exclude "libraft.so"
+    )
+fi
+
 sccache --zero-stats
 
 rapids-logger "Building '${package_name}' wheel"
@@ -55,4 +62,4 @@ sccache --show-adv-stats
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_type} final_dist
diff --git a/ci/build_wheel_libraft.sh b/ci/build_wheel_libraft.sh
new file mode 100755
index 0000000000..825a5124a8
--- /dev/null
+++ b/ci/build_wheel_libraft.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="libraft"
+package_dir="python/libraft"
+
+rapids-logger "Generating build requirements"
+matrix_selectors="cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "${matrix_selectors}" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+case "${RAPIDS_CUDA_VERSION}" in
+  12.*)
+    EXTRA_CMAKE_ARGS="-DUSE_CUDA_MATH_WHEELS=ON"
+  ;;
+  11.*)
+    EXTRA_CMAKE_ARGS="-DUSE_CUDA_MATH_WHEELS=OFF"
+  ;;
+esac
+
+export SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
+
+ci/build_wheel.sh libraft ${package_dir} cpp
+ci/validate_wheel.sh ${package_dir} final_dist libraft
diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
index dd62ab5399..6f74e0e8c5 100755
--- a/ci/build_wheel_pylibraft.sh
+++ b/ci/build_wheel_pylibraft.sh
@@ -5,17 +5,16 @@ set -euo pipefail
 
 package_dir="python/pylibraft"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
-  ;;
-  11.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
-  ;;
-esac
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF${EXTRA_CMAKE_ARGS}"
+# Downloads libraft wheels from this current build,
+# then ensures 'pylibraft' wheel builds always use the 'libraft' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist
+echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-ci/build_wheel.sh pylibraft ${package_dir}
+ci/build_wheel.sh pylibraft ${package_dir} python
 ci/validate_wheel.sh ${package_dir} final_dist pylibraft
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
index d49d131abf..0cacb6fe30 100755
--- a/ci/build_wheel_raft_dask.sh
+++ b/ci/build_wheel_raft_dask.sh
@@ -5,8 +5,16 @@ set -euo pipefail
 
 package_dir="python/raft-dask"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-ci/build_wheel.sh raft-dask ${package_dir}
+# Downloads libraft wheels from this current build,
+# then ensures 'raft-dask' wheel builds always use the 'libraft' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist
+echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
+
+ci/build_wheel.sh raft-dask ${package_dir} python
 ci/validate_wheel.sh ${package_dir} final_dist raft-dask
diff --git a/ci/check_style.sh b/ci/check_style.sh
index d7ba4cae25..e0c30a2d41 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -14,5 +14,12 @@ rapids-dependency-file-generator \
 rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks
 
+# get config for cmake-format checks
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json"
+export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
+mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
+wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
+
 # Run pre-commit checks
 pre-commit run --all-files --show-diff-on-failure
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index a70fed9ec8..1ab9157b89 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -43,6 +43,8 @@ echo "${NEXT_FULL_TAG}" > VERSION
 
 DEPENDENCIES=(
   dask-cuda
+  libraft
+  librmm
   pylibraft
   rmm
   rapids-dask-dependency
diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index b38f5a690b..1e0b34d609 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -5,9 +5,13 @@ set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+
 
 # echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/pylibraft*.whl)[test]
+python -m pip install \
+    ./local-libraft-dep/libraft*.whl \
+    "$(echo ./dist/pylibraft*.whl)[test]"
 
 python -m pytest ./python/pylibraft/pylibraft/test
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index a778a3ec51..011de4d409 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -5,13 +5,13 @@ set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-
-# Download the pylibraft built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibraft-dep
+RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install -v \
+    ./local-libraft-dep/libraft*.whl \
     ./local-pylibraft-dep/pylibraft*.whl \
     "$(echo ./dist/raft_dask_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
 
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index 5ef72ad895..ca506af004 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -10,23 +10,17 @@ package_name=$3
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
 # some packages are much larger on CUDA 11 than on CUDA 12
-if [[ "${package_name}" == "raft-dask" ]]; then
-    PYDISTCHECK_ARGS=(
-        --max-allowed-size-compressed '200M'
-    )
-elif [[ "${package_name}" == "pylibraft" ]]; then
+PYDISTCHECK_ARGS=()
+if [[ "${package_name}" == "libraft" ]]; then
     if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
-        PYDISTCHECK_ARGS=(
-            --max-allowed-size-compressed '600M'
+        PYDISTCHECK_ARGS+=(
+            --max-allowed-size-compressed '750M'
         )
     else
-        PYDISTCHECK_ARGS=(
+        PYDISTCHECK_ARGS+=(
             --max-allowed-size-compressed '100M'
         )
     fi
-else
-    echo "Unsupported package name: ${package_name}"
-    exit 1
 fi
 
 cd "${package_dir}"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 621f9fcef2..eb7e8540f0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -65,9 +65,12 @@ set(RAFT_COMPILE_LIBRARY_DEFAULT OFF)
 if(BUILD_TESTS OR BUILD_PRIMS_BENCH)
   set(RAFT_COMPILE_LIBRARY_DEFAULT ON)
 endif()
-option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
+option(RAFT_COMPILE_LIBRARY "Enable building raft library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
+option(RAFT_COMPILE_DYNAMIC_ONLY "Only build the shared library and skip the
+static library. Has no effect if RAFT_COMPILE_LIBRARY is OFF" OFF
+)
 
 # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
 # have different values for the `Threads::Threads` target. Setting this flag ensures
@@ -311,17 +314,23 @@ if(RAFT_COMPILE_LIBRARY)
   # Make sure not to add the rmm logger twice since it will be brought in as an interface source by
   # the rmm::rmm_logger_impl target.
   add_library(raft_lib SHARED $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
-  add_library(raft_lib_static STATIC $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
+
+  set(_raft_lib_targets raft_lib)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    add_library(raft_lib_static STATIC $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
+    list(APPEND _raft_lib_targets raft_lib_static)
+  endif()
 
   set_target_properties(
-    raft_lib raft_lib_static
+    ${_raft_lib_targets}
     PROPERTIES OUTPUT_NAME raft
                BUILD_RPATH "\$ORIGIN"
                INSTALL_RPATH "\$ORIGIN"
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
-  foreach(target raft_lib raft_lib_static raft_objs)
+  list(APPEND _raft_lib_targets raft_objs)
+  foreach(target IN LISTS _raft_lib_targets)
     target_link_libraries(
       ${target}
       PUBLIC raft::raft
@@ -336,7 +345,9 @@ if(RAFT_COMPILE_LIBRARY)
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
   endforeach()
   target_link_libraries(raft_lib PRIVATE rmm::rmm_logger_impl raft_logger_impl)
-  target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl raft_logger_impl)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl raft_logger_impl)
+  endif()
 endif()
 
 if(TARGET raft_lib AND (NOT TARGET raft::raft_lib))
@@ -348,20 +359,22 @@ target_link_libraries(raft_compiled INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS
 # ##################################################################################################
 # * raft_compiled_static----------------------------------------------------------------------------
 
-add_library(raft_compiled_static INTERFACE)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  add_library(raft_compiled_static INTERFACE)
 
-if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
-  add_library(raft::compiled_static ALIAS raft_compiled_static)
-endif()
-set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
+  if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
+    add_library(raft::compiled_static ALIAS raft_compiled_static)
+  endif()
+  set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
 
-if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
-  add_library(raft::raft_lib_static ALIAS raft_lib_static)
-endif()
+  if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
+    add_library(raft::raft_lib_static ALIAS raft_lib_static)
+  endif()
 
-target_link_libraries(
-  raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
-)
+  target_link_libraries(
+    raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
+  )
+endif()
 
 # ##################################################################################################
 # * raft_distributed -------------------------------------------------------------------------------
@@ -410,8 +423,12 @@ install(
   EXPORT raft-exports
 )
 
+set(_raft_compiled_install_targets raft_compiled)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  list(APPEND _raft_compiled_install_targets raft_compiled_static)
+endif()
 install(
-  TARGETS raft_compiled raft_compiled_static
+  TARGETS ${_raft_compiled_install_targets}
   DESTINATION ${lib_dir}
   COMPONENT raft
   EXPORT raft-compiled-exports
@@ -424,12 +441,14 @@ if(TARGET raft_lib)
     COMPONENT compiled
     EXPORT raft-compiled-lib-exports
   )
-  install(
-    TARGETS raft_lib_static
-    DESTINATION ${lib_dir}
-    COMPONENT compiled-static
-    EXPORT raft-compiled-static-lib-exports
-  )
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    install(
+      TARGETS raft_lib_static
+      DESTINATION ${lib_dir}
+      COMPONENT compiled-static
+      EXPORT raft-compiled-static-lib-exports
+    )
+  endif()
   install(
     DIRECTORY include/raft_runtime
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
@@ -500,8 +519,12 @@ endif()
 set(raft_components compiled distributed)
 set(raft_export_sets raft-compiled-exports raft-distributed-exports)
 if(TARGET raft_lib)
-  list(APPEND raft_components compiled compiled-static)
-  list(APPEND raft_export_sets raft-compiled-lib-exports raft-compiled-static-lib-exports)
+  list(APPEND raft_components compiled)
+  list(APPEND raft_export_sets raft-compiled-lib-exports)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    list(APPEND raft_components compiled-static)
+    list(APPEND raft_export_sets raft-compiled-static-lib-exports)
+  endif()
 endif()
 
 string(
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index b364d8418d..25b9b0ddf8 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -14,7 +14,9 @@
 
 if(DISABLE_DEPRECATION_WARNINGS)
   list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
-  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
+  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations
+       -DRAFT_HIDE_DEPRECATION_WARNINGS
+  )
 endif()
 
 # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with
diff --git a/dependencies.yaml b/dependencies.yaml
index 689cf8414c..44c240b6ce 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,6 +6,8 @@ files:
       cuda: ["11.8", "12.5"]
       arch: [x86_64, aarch64]
     includes:
+      - build_common
+      - build_cython
       - checks
       - cuda
       - cuda_version
@@ -15,7 +17,6 @@ files:
       - depends_on_rmm
       - develop
       - docs
-      - rapids_build
       - rapids_build_skbuild
       - run_pylibraft
       - run_raft_dask
@@ -48,6 +49,29 @@ files:
       - docs
       - py_version
       - test_pylibraft
+  py_build_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_rapids_build_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - build_common
+      - depends_on_librmm
+  py_run_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: project
+    includes:
+      - cuda_wheels
   py_build_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -62,16 +86,19 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
+      - build_common
+      - build_cython
+      - depends_on_libraft
+      - depends_on_librmm
       - depends_on_cuda_python
       - depends_on_rmm
-      - rapids_build
   py_run_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
     extras:
       table: project
     includes:
-      - cuda_wheels
+      - depends_on_libraft
       - depends_on_cuda_python
       - depends_on_rmm
       - run_pylibraft
@@ -99,8 +126,11 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
+      - build_common
+      - build_cython
+      - depends_on_libraft
+      - depends_on_librmm
       - depends_on_ucx_build
-      - rapids_build
   py_run_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -108,6 +138,7 @@ files:
       table: project
     includes:
       - depends_on_distributed_ucxx
+      - depends_on_libraft
       - run_raft_dask
   py_test_raft_dask:
     output: pyproject
@@ -135,12 +166,11 @@ dependencies:
       - output_types: [requirements, pyproject]
         packages:
           - scikit-build-core[pyproject]>=0.10.0
-  rapids_build:
+  build_common:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4,!=3.30.0
-          - cython>=3.0.0,<3.1.0a0
           - ninja
       - output_types: [conda]
         packages:
@@ -182,7 +212,11 @@ dependencies:
             packages: [nvcc_linux-64=11.2]
           - matrix: {cuda: "11.2", arch: aarch64}
             packages: [nvcc_linux-aarch64=11.2]
-
+  build_cython:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cython>=3.0.0,<3.1.0a0
   checks:
     common:
       - output_types: [conda, requirements]
@@ -471,6 +505,55 @@ dependencies:
             packages:
               - distributed-ucxx-cu11==0.42.*,>=0.0.0a0
           - {matrix: null, packages: [*distributed_ucxx_unsuffixed]}
+  depends_on_libraft:
+    common:
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - libraft==25.2.*,>=0.0.0a0
+  depends_on_librmm:
+    common:
+      - output_types: conda
+        packages:
+          - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *librmm_unsuffixed
   depends_on_rmm:
     common:
       - output_types: conda
diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt
new file mode 100644
index 0000000000..57efcd61ab
--- /dev/null
+++ b/python/libraft/CMakeLists.txt
@@ -0,0 +1,65 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+project(
+  libraft-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX
+)
+
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
+
+# Check if raft is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python raft package.
+find_package(raft "${RAPIDS_VERSION}")
+
+if(raft_FOUND)
+  return()
+endif()
+
+unset(raft_FOUND)
+
+# --- CUDA --- #
+find_package(CUDAToolkit REQUIRED)
+set(CUDA_STATIC_RUNTIME ON)
+set(CUDA_STATIC_MATH_LIBRARIES ON)
+if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+  set(CUDA_STATIC_MATH_LIBRARIES OFF)
+elseif(USE_CUDA_MATH_WHEELS)
+  message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+endif()
+
+# --- RAFT ---#
+set(BUILD_TESTS OFF)
+set(BUILD_PRIMS_BENCH OFF)
+set(RAFT_COMPILE_DYNAMIC_ONLY ON)
+set(RAFT_COMPILE_LIBRARY ON)
+
+add_subdirectory(../../cpp raft-cpp)
+
+if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+  set_property(
+    TARGET raft_lib
+    PROPERTY INSTALL_RPATH
+             "$ORIGIN/../nvidia/cublas/lib"
+             "$ORIGIN/../nvidia/curand/lib"
+             "$ORIGIN/../nvidia/cusolver/lib"
+             "$ORIGIN/../nvidia/cusparse/lib"
+             "$ORIGIN/../nvidia/nvjitlink/lib"
+  )
+endif()
diff --git a/python/libraft/LICENSE b/python/libraft/LICENSE
new file mode 120000
index 0000000000..30cff7403d
--- /dev/null
+++ b/python/libraft/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libraft/README.md b/python/libraft/README.md
new file mode 120000
index 0000000000..fe84005413
--- /dev/null
+++ b/python/libraft/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/libraft/libraft/VERSION b/python/libraft/libraft/VERSION
new file mode 120000
index 0000000000..d62dc733ef
--- /dev/null
+++ b/python/libraft/libraft/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libraft/libraft/__init__.py b/python/libraft/libraft/__init__.py
new file mode 100644
index 0000000000..9260f4e67c
--- /dev/null
+++ b/python/libraft/libraft/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libraft._version import __git_commit__, __version__
+from libraft.load import load_library
diff --git a/python/libraft/libraft/_version.py b/python/libraft/libraft/_version.py
new file mode 100644
index 0000000000..530bf8bea6
--- /dev/null
+++ b/python/libraft/libraft/_version.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/libraft/libraft/load.py b/python/libraft/libraft/load.py
new file mode 100644
index 0000000000..ad3db9e09c
--- /dev/null
+++ b/python/libraft/libraft/load.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libraft that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Raises ``OSError`` if library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Returns ``None`` if not found under ``lib64``; may raise ``OSError``.
+    """
+    if os.path.isfile(
+        lib := os.path.join(os.path.dirname(__file__), "lib64", soname)
+    ):
+        return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG)
+    return None
+
+
+def load_library():
+    """Dynamically load libraft.so and its dependencies"""
+    prefer_system_installation = (
+        os.getenv("RAPIDS_LIBRAFT_PREFER_SYSTEM_LIBRARY", "false").lower()
+        != "false"
+    )
+
+    soname = "libraft.so"
+    libraft_lib = None
+    if prefer_system_installation:
+        # Prefer a system library if one is present to
+        # avoid clobbering symbols that other packages might expect, but if no
+        # other library is present use the one in the wheel.
+        try:
+            libraft_lib = _load_system_installation(soname)
+        except OSError:
+            libraft_lib = _load_wheel_installation(soname)
+    else:
+        # Prefer the libraries bundled in this package. If they aren't found
+        # (which might be the case in builds where the library was prebuilt
+        # before packaging the wheel), look for a system installation.
+        try:
+            libraft_lib = _load_wheel_installation(soname)
+            if libraft_lib is None:
+                libraft_lib = _load_system_installation(soname)
+        except OSError:
+            # If none of the searches above succeed, just silently return None
+            # and rely on other mechanisms (like RPATHs on other DSOs) to
+            # help the loader find the library.
+            pass
+
+    # The caller almost never needs to do anything with this library, but no
+    # harm in offering the option since this object at least provides a handle
+    # to inspect where libraft was loaded from.
+    return libraft_lib
diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml
new file mode 100644
index 0000000000..549a1bf651
--- /dev/null
+++ b/python/libraft/pyproject.toml
@@ -0,0 +1,115 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+build-backend = "rapids_build_backend.build"
+
+[project]
+name = "libraft"
+dynamic = ["version"]
+description = "RAFT: Reusable Algorithms Functions and other Tools (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+dependencies = [
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+]
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/raft"
+Documentation = "https://docs.rapids.ai/api/raft/stable/"
+
+[project.entry-points."cmake.prefix"]
+libraft = "libraft"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_first_party = [
+    "libraft",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.install-dir = "libraft"
+wheel.packages = ["libraft"]
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libraft/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "librmm==25.2.*,>=0.0.0a0",
+    "ninja",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+
+[tool.pydistcheck]
+select = [
+    # NOTE: size threshold is managed via CLI args in CI scripts
+    "distro-too-large-compressed",
+]
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index 758c1e4711..83c262dc10 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -27,68 +27,13 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       ON
-)
-option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
-
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
-  if(NOT TARGET raft::raft_lib)
-    message(
-      FATAL_ERROR
-        "Building against a preexisting libraft library requires the compiled libraft to have been built!"
-    )
-
-  endif()
-else()
-  set(raft_FOUND OFF)
-endif()
+# an installed version of raft contains the other necessary targets (like CCCL and cuco)
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS raft compiled)
 
 include(rapids-cython-core)
 
-if(NOT raft_FOUND)
-  find_package(CUDAToolkit REQUIRED)
-
-  set(BUILD_TESTS OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARY ON)
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
-    set(CUDA_STATIC_MATH_LIBRARIES OFF)
-  elseif(USE_CUDA_MATH_WHEELS)
-    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
-  endif()
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-
-  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
-    set_property(
-      TARGET raft_lib
-      PROPERTY INSTALL_RPATH
-               "$ORIGIN/../nvidia/cublas/lib"
-               "$ORIGIN/../nvidia/curand/lib"
-               "$ORIGIN/../nvidia/cusolver/lib"
-               "$ORIGIN/../nvidia/cusparse/lib"
-               "$ORIGIN/../nvidia/nvjitlink/lib"
-    )
-  endif()
-
-  # When building the C++ libraries from source we must copy libraft.so alongside the
-  # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft
-  # library, we shouldn't need this
-  set(cython_lib_dir pylibraft)
-  install(TARGETS raft_lib DESTINATION ${cython_lib_dir})
-endif()
-
 rapids_cython_init()
 
 add_subdirectory(pylibraft/common)
 add_subdirectory(pylibraft/random)
 add_subdirectory(pylibraft/sparse)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET raft PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index b0869501f3..a01e02ec33 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -13,4 +13,15 @@
 # limitations under the License.
 #
 
+# If libraft was installed as a wheel, we must request it to load the library
+# symbols. Otherwise, we assume that the library was installed in a system path that ld
+# can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft
+
 from pylibraft._version import __git_commit__, __version__
diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt
index 53279bfaf7..d1c1acb3aa 100644
--- a/python/pylibraft/pylibraft/common/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/common/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
 )
diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt
index 10ff776471..7d61855111 100644
--- a/python/pylibraft/pylibraft/random/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/random/CMakeLists.txt
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX random_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX random_
 )
diff --git a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
index ef16981644..7b2c9f6162 100644
--- a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX sparse_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX sparse_
 )
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index ba454af591..912f1ad947 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -32,11 +32,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "cuda-python",
+    "libraft==25.2.*,>=0.0.0a0",
     "numpy>=1.23,<3.0a0",
-    "nvidia-cublas",
-    "nvidia-curand",
-    "nvidia-cusolver",
-    "nvidia-cusparse",
     "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -124,18 +121,22 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cuda-python",
     "cython>=3.0.0,<3.1.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "ninja",
     "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.pydistcheck]
 select = [
-    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
 
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 9ebbaa5298..1fcb40a58d 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -25,38 +25,16 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       OFF
-)
-
 rapids_cpm_init()
 # Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can remove this.
 find_package(ucx REQUIRED)
 include(cmake/thirdparty/get_ucxx.cmake)
 
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
-else()
-  set(raft_FOUND OFF)
-endif()
-
-if(NOT raft_FOUND)
-  # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
-  # library compilation and we don't need to install anything here.
-  set(BUILD_TESTS OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARIES OFF)
-  set(RAFT_COMPILE_DIST_LIBRARY OFF)
-  set(RAFT_COMPILE_NN_LIBRARY OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  set(RAFT_DASK_UCXX_STATIC ON)
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-  list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
-  find_package(NCCL REQUIRED)
-endif()
+# why these components:
+#
+# * 'raft' = the headers, needed to link against libraft
+# * 'distributed' = needed for NCCL
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS raft distributed)
 
 include(rapids-cython-core)
 rapids_cython_init()
diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
index f5daf70f92..e6b9c4aa0e 100644
--- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
@@ -45,8 +45,8 @@ function(find_and_configure_ucxx)
 endfunction()
 
 # Change pinned tag here to test a commit in CI
-# To use a different RAFT locally, set the CMake variable
-# CPM_raft_SOURCE=/path/to/local/raft
+# To use a different ucxx locally, set the CMake variable
+# CPM_ucxx_SOURCE=/path/to/local/ucxx
 find_and_configure_ucxx(VERSION  0.42
         FORK             rapidsai
         PINNED_TAG       branch-0.42
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index cabe8e72a6..d3a26db282 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -33,6 +33,7 @@ requires-python = ">=3.10"
 dependencies = [
     "dask-cuda==25.2.*,>=0.0.0a0",
     "distributed-ucxx==0.42.*,>=0.0.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
     "pylibraft==25.2.*,>=0.0.0a0",
     "rapids-dask-dependency==25.2.*,>=0.0.0a0",
     "ucx-py==0.42.*,>=0.0.0a0",
@@ -119,6 +120,8 @@ build-backend = "scikit_build_core.build"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.0,<3.1.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "libucx==1.15.0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 19a037ae75..78248fad7a 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
 
-from raft_dask._version import __git_commit__, __version__
-
 # If libucx was installed as a wheel, we must request it to load the library symbols.
 # Otherwise, we assume that the library was installed in a system path that ld can find.
 try:
@@ -24,3 +22,16 @@
 else:
     libucx.load_library()
     del libucx
+
+# If libraft was installed as a wheel, we must request it to load the library
+# symbols. Otherwise, we assume that the library was installed in a system path that ld
+# can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft
+
+from raft_dask._version import __git_commit__, __version__
diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt
index 65d5f06577..1279d5d501 100644
--- a/python/raft-dask/raft_dask/common/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/common/CMakeLists.txt
@@ -15,6 +15,5 @@
 set(cython_sources comms_utils.pyx nccl.pyx)
 set(linked_libraries raft::raft raft::distributed)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
-                                                                            CXX
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX
 )
diff --git a/python/raft-dask/raft_dask/include_test/CMakeLists.txt b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
index 2ff1cd9150..8839c57b91 100644
--- a/python/raft-dask/raft_dask/include_test/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
@@ -15,6 +15,5 @@
 set(cython_sources raft_include_test.pyx)
 set(linked_libraries raft::raft)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
-                                                                            CXX
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX
 )
diff --git a/rapids_config.cmake b/rapids_config.cmake
index c8077f7f4b..a40d7130c0 100644
--- a/rapids_config.cmake
+++ b/rapids_config.cmake
@@ -22,13 +22,15 @@ else()
   string(REPLACE "\n" "\n  " _rapids_version_formatted "  ${_rapids_version}")
   message(
     FATAL_ERROR
-      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}")
+      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}"
+  )
 endif()
 
 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
   file(
     DOWNLOAD
     "https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/RAPIDS.cmake"
-    "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
+    "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake"
+  )
 endif()
 include("${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")

From 097ac45fd98f61109a943b2f33757d77532edb17 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 16 Jan 2025 23:25:14 -0500
Subject: [PATCH 24/37] Revert "Introduction of the
 `raft::device_resources_snmg` type (#2487)" (#2543)

This reverts commit fb6bfe6ee956a5e40295300d453f1261ece3cedf.

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2543
---
 cpp/include/raft/comms/nccl_clique.hpp        | 156 +++++++++++++
 .../raft/core/device_resources_snmg.hpp       | 217 ------------------
 .../raft/core/resource/nccl_clique.hpp        |  66 ++++++
 cpp/include/raft/core/resources.hpp           |   3 +-
 docs/source/cpp_api/core_resources.rst        |  17 --
 5 files changed, 223 insertions(+), 236 deletions(-)
 create mode 100644 cpp/include/raft/comms/nccl_clique.hpp
 delete mode 100644 cpp/include/raft/core/device_resources_snmg.hpp
 create mode 100644 cpp/include/raft/core/resource/nccl_clique.hpp

diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp
new file mode 100644
index 0000000000..c6520af753
--- /dev/null
+++ b/cpp/include/raft/comms/nccl_clique.hpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_resources.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <nccl.h>
+
+/**
+ * @brief Error checking macro for NCCL runtime API functions.
+ *
+ * Evaluates `call` once; if it does not return ncclSuccess, throws a raft::logic_error whose
+ * message is built by SET_ERROR_MSG from the call text, the status and ncclGetErrorString().
+ */
+#define RAFT_NCCL_TRY(call)                        \
+  do {                                             \
+    ncclResult_t const status = (call);            \
+    if (ncclSuccess != status) {                   \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "NCCL error encountered at: ", \
+                    "call='%s', Reason=%d:%s",     \
+                    #call,                         \
+                    status,                        \
+                    ncclGetErrorString(status));   \
+      throw raft::logic_error(msg);                \
+    }                                              \
+  } while (0);
+
+namespace raft::comms {
+void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank);
+}  // namespace raft::comms
+
+namespace raft::comms {
+
+struct nccl_clique {
+  using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+
+  /**
+   * Instantiates a NCCL clique with all available GPUs (device IDs 0 .. device count - 1)
+   *
+   * @param[in] percent_of_free_memory percentage of free device memory to pre-allocate as the
+   * memory pool of each device
+   */
+  nccl_clique(int percent_of_free_memory = 80)
+    : root_rank_(0),
+      percent_of_free_memory_(percent_of_free_memory),
+      per_device_pools_(0),
+      device_resources_(0)
+  {
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_ranks_));  // num_ranks_ has no other initializer
+    device_ids_.resize(num_ranks_);
+    std::iota(device_ids_.begin(), device_ids_.end(), 0);
+    nccl_comms_.resize(num_ranks_);
+    nccl_clique_init();
+  }
+
+  /**
+   * Instantiates a NCCL clique on an explicit list of devices
+   *
+   * Usage example:
+   * @code{.cpp}
+   * int n_devices;
+   * cudaGetDeviceCount(&n_devices);
+   * std::vector<int> device_ids(n_devices);
+   * std::iota(device_ids.begin(), device_ids.end(), 0);
+   * raft::comms::nccl_clique clique(device_ids);  // first device is the root rank
+   * @endcode
+   *
+   * @param[in] device_ids list of device IDs to be used to initiate the clique
+   * @param[in] percent_of_free_memory percentage of free device memory to pre-allocate as the
+   * memory pool of each device
+   */
+  nccl_clique(const std::vector<int>& device_ids, int percent_of_free_memory = 80)
+    : root_rank_(0),
+      num_ranks_(device_ids.size()),
+      percent_of_free_memory_(percent_of_free_memory),
+      device_ids_(device_ids),
+      nccl_comms_(device_ids.size()),
+      per_device_pools_(0),
+      device_resources_(0)
+  {
+    nccl_clique_init();
+  }
+
+  void nccl_clique_init()  // shared by both constructors
+  {
+    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data()));
+
+    for (int rank = 0; rank < num_ranks_; rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
+
+      // wrap the current device resource in a pool and register the pool for this device
+      auto old_mr = rmm::mr::get_current_device_resource();
+      per_device_pools_.push_back(std::make_unique<pool_mr>(
+        old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_)));
+      rmm::cuda_device_id id(device_ids_[rank]);
+      rmm::mr::set_per_device_resource(id, per_device_pools_.back().get());
+
+      // create a device resource handle for each device
+      device_resources_.emplace_back();
+
+      // add NCCL communications to the device resource handle
+      raft::comms::build_comms_nccl_only(
+        &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank);
+    }
+
+    for (int rank = 0; rank < num_ranks_; rank++) {  // second pass: sync every rank's stream
+      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
+      raft::resource::sync_stream(device_resources_[rank]);
+    }
+  }
+
+  const raft::device_resources& set_current_device_to_root_rank() const
+  {
+    int root_device_id = device_ids_[root_rank_];  // rank -> CUDA device ID
+    RAFT_CUDA_TRY(cudaSetDevice(root_device_id));
+    return device_resources_[root_rank_];
+  }
+
+  ~nccl_clique()
+  {
+#pragma omp parallel for  // necessary to avoid hangs
+    for (int rank = 0; rank < num_ranks_; rank++) {
+      cudaSetDevice(device_ids_[rank]);  // statuses are not checked in the destructor
+      ncclCommDestroy(nccl_comms_[rank]);
+      rmm::cuda_device_id id(device_ids_[rank]);
+      rmm::mr::set_per_device_resource(id, nullptr);  // drop the pool before members are freed
+    }
+  }
+
+  int root_rank_;                                           // index of the root rank (always 0)
+  int num_ranks_;                                           // number of ranks (one per GPU)
+  int percent_of_free_memory_;                              // pool size, % of free device memory
+  std::vector<int> device_ids_;                             // CUDA device ID of each rank
+  std::vector<ncclComm_t> nccl_comms_;                      // NCCL communicator of each rank
+  std::vector<std::shared_ptr<pool_mr>> per_device_pools_;  // memory pool of each rank
+  std::vector<raft::device_resources> device_resources_;    // raft handle of each rank
+};
+
+}  // namespace raft::comms
diff --git a/cpp/include/raft/core/device_resources_snmg.hpp b/cpp/include/raft/core/device_resources_snmg.hpp
deleted file mode 100644
index f20a81a1c6..0000000000
--- a/cpp/include/raft/core/device_resources_snmg.hpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/core/device_resources.hpp>
-
-#include <nccl.h>
-#include <omp.h>
-
-#include <memory>
-#include <vector>
-
-/**
- * @brief Error checking macro for NCCL runtime API functions.
- *
- * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
- * exception detailing the NCCL error that occurred
- */
-#define RAFT_NCCL_TRY(call)                        \
-  do {                                             \
-    ncclResult_t const status = (call);            \
-    if (ncclSuccess != status) {                   \
-      std::string msg{};                           \
-      SET_ERROR_MSG(msg,                           \
-                    "NCCL error encountered at: ", \
-                    "call='%s', Reason=%d:%s",     \
-                    #call,                         \
-                    status,                        \
-                    ncclGetErrorString(status));   \
-      throw raft::logic_error(msg);                \
-    }                                              \
-  } while (0);
-
-namespace raft {
-
-/**
- * @brief SNMG (single-node multi-GPU) resource container object that stores a NCCL clique and all
- * necessary resources used for calling device functions, cuda kernels, libraries and/or NCCL
- * communications on each GPU. Note the `device_resources_snmg` object can also be used as a classic
- * `device_resources` object. The associated resources will be the ones of the GPU used during
- * object instantiation and a GPU switch operation will be ordered during the retrieval of said
- * resources.
- *
- * The `device_resources_snmg` class is intended to be used in a single process to manage several
- * GPUs. Please note that NCCL communications are the responsibility of the user. Blocking NCCL
- * calls will sometimes require the use of several threads to avoid hangs.
- */
-class device_resources_snmg : public device_resources {
- public:
-  /**
-   * @brief Construct a SNMG resources instance with all available GPUs
-   */
-  device_resources_snmg() : device_resources(), root_rank_(0)
-  {
-    cudaGetDevice(&main_gpu_id_);
-
-    int num_ranks;
-    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_ranks));
-    device_ids_.resize(num_ranks);
-    std::iota(device_ids_.begin(), device_ids_.end(), 0);
-    nccl_comms_.resize(num_ranks);
-    initialize();
-  }
-
-  /**
-   * @brief Construct a SNMG resources instance with a subset of available GPUs
-   *
-   * @param[in] device_ids List of device IDs to be used by the NCCL clique
-   */
-  device_resources_snmg(const std::vector<int>& device_ids)
-    : device_resources(), root_rank_(0), device_ids_(device_ids), nccl_comms_(device_ids.size())
-  {
-    cudaGetDevice(&main_gpu_id_);
-
-    initialize();
-  }
-
-  /**
-   * @brief SNMG resources instance copy constructor
-   *
-   * @param[in] clique A SNMG resources instance
-   */
-  device_resources_snmg(const device_resources_snmg& clique)
-    : device_resources(clique),
-      root_rank_(clique.root_rank_),
-      main_gpu_id_(clique.main_gpu_id_),
-      device_ids_(clique.device_ids_),
-      nccl_comms_(clique.nccl_comms_),
-      device_resources_(clique.device_resources_)
-  {
-  }
-
-  device_resources_snmg(device_resources_snmg&&)            = delete;
-  device_resources_snmg& operator=(device_resources_snmg&&) = delete;
-
-  /**
-   * @brief Set root rank of NCCL clique
-   */
-  inline int set_root_rank(int rank) { this->root_rank_ = rank; }
-
-  /**
-   * @brief Get root rank of NCCL clique
-   */
-  inline int get_root_rank() const { return this->root_rank_; }
-
-  /**
-   * @brief Get number of ranks in NCCL clique
-   */
-  inline int get_num_ranks() const { return this->device_ids_.size(); }
-
-  /**
-   * @brief Get device ID of rank in NCCL clique
-   */
-  inline int get_device_id(int rank) const { return this->device_ids_[rank]; }
-
-  /**
-   * @brief Get NCCL comm object of rank in NCCL clique
-   */
-  inline ncclComm_t get_nccl_comm(int rank) const { return this->nccl_comms_[rank]; }
-
-  /**
-   * @brief Get raft::device_resources object of rank in NCCL clique
-   */
-  inline const raft::device_resources& get_device_resources(int rank) const
-  {
-    return this->device_resources_[rank];
-  }
-
-  /**
-   * @brief Set current device ID to root rank and return its raft::device_resources object
-   */
-  inline const raft::device_resources& set_current_device_to_root_rank() const
-  {
-    return set_current_device_to_rank(get_root_rank());
-  }
-
-  /**
-   * @brief Set current device ID to rank and return its raft::device_resources object
-   */
-  inline const raft::device_resources& set_current_device_to_rank(int rank) const
-  {
-    RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
-    return get_device_resources(rank);
-  }
-
-  /**
-   * @brief Set a memory pool on all GPUs of the clique
-   */
-  void set_memory_pool(int percent_of_free_memory) const
-  {
-    for (int rank = 0; rank < get_num_ranks(); rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
-      size_t limit =
-        rmm::percent_of_free_device_memory(percent_of_free_memory);  // check limit for each device
-      raft::resource::set_workspace_to_pool_resource(get_device_resources(rank), limit);
-    }
-    cudaSetDevice(this->main_gpu_id_);
-  }
-
-  bool has_resource_factory(resource::resource_type resource_type) const override
-  {
-    cudaSetDevice(this->main_gpu_id_);
-    return raft::resources::has_resource_factory(resource_type);
-  }
-
-  /** Destroys all held-up resources */
-  ~device_resources_snmg()
-  {
-#pragma omp parallel for  // necessary to avoid hangs
-    for (int rank = 0; rank < get_num_ranks(); rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
-      RAFT_NCCL_TRY(ncclCommDestroy(get_nccl_comm(rank)));
-    }
-    cudaSetDevice(this->main_gpu_id_);
-  }
-
- private:
-  /**
-   * @brief Initializes the NCCL clique and raft::device_resources objects
-   */
-  void initialize()
-  {
-    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), get_num_ranks(), device_ids_.data()));
-
-    for (int rank = 0; rank < get_num_ranks(); rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
-      device_resources_.emplace_back();
-
-      // ideally add the ncclComm_t to the device_resources object with
-      // raft::comms::build_comms_nccl_only
-    }
-    cudaSetDevice(this->main_gpu_id_);
-  }
-
-  int root_rank_;
-  int main_gpu_id_;
-  std::vector<int> device_ids_;
-  std::vector<ncclComm_t> nccl_comms_;
-  std::vector<raft::device_resources> device_resources_;
-
-};  // class device_resources_snmg
-
-}  // namespace raft
diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp
new file mode 100644
index 0000000000..edda5043ae
--- /dev/null
+++ b/cpp/include/raft/core/resource/nccl_clique.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/comms/nccl_clique.hpp>
+#include <raft/core/resource/resource_types.hpp>
+#include <raft/core/resources.hpp>
+
+#include <memory>
+
+namespace raft::resource {
+
+class nccl_clique_resource : public resource {
+ public:
+  nccl_clique_resource() : clique_(std::make_unique<raft::comms::nccl_clique>()) {}
+  ~nccl_clique_resource() override {}
+  void* get_resource() override { return clique_.get(); }  // non-owning, type-erased pointer
+
+ private:
+  std::unique_ptr<raft::comms::nccl_clique> clique_;  // default-constructed clique, owned here
+};
+
+/** Factory that creates nccl_clique_resource objects and reports resource_type::NCCL_CLIQUE. */
+class nccl_clique_resource_factory : public resource_factory {
+ public:
+  resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; }
+  resource* make_resource() override { return new nccl_clique_resource(); }
+};
+
+/**
+ * @defgroup nccl_clique_resource resource functions
+ * @{
+ */
+
+/**
+ * Retrieves the NCCL clique from raft res if it exists, otherwise initializes it and returns it.
+ *
+ * @param[in] res the raft resources object
+ * @return NCCL clique
+ */
+inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res)
+{
+  if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) {
+    res.add_resource_factory(std::make_shared<nccl_clique_resource_factory>());
+  }
+  return *res.get_resource<raft::comms::nccl_clique>(resource_type::NCCL_CLIQUE);
+}
+
+/**
+ * @}
+ */
+
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
index 44525edb23..b0827d8e11 100644
--- a/cpp/include/raft/core/resources.hpp
+++ b/cpp/include/raft/core/resources.hpp
@@ -72,7 +72,6 @@ class resources {
   resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {}
   resources(resources&&)            = delete;
   resources& operator=(resources&&) = delete;
-  virtual ~resources() {}
 
   /**
    * @brief Returns true if a resource_factory has been registered for the
@@ -80,7 +79,7 @@ class resources {
    * @param resource_type resource type to check
    * @return true if resource_factory is registered for the given resource_type
    */
-  virtual bool has_resource_factory(resource::resource_type resource_type) const
+  bool has_resource_factory(resource::resource_type resource_type) const
   {
     std::lock_guard<std::mutex> _(mutex_);
     return factories_.at(resource_type).first != resource::resource_type::LAST_KEY;
diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst
index 3c242af848..0da11acae6 100644
--- a/docs/source/cpp_api/core_resources.rst
+++ b/docs/source/cpp_api/core_resources.rst
@@ -55,23 +55,6 @@ namespace *raft::core*
     :project: RAFT
     :members:
 
-SNMG Device Resources
----------------------
-
-The `raft::device_resources_snmg` provides a convenient way to design SNMG
-(single-node multi-GPU) algorithms. It initiates device-related resources
-for a set of devices forming clique. This includes NCCL communications.
-GPUs can be addressed and exchanges be made over multiple threads
-for performance or convenience.
-
-``#include <raft/core/device_resources_snmg.hpp>``
-
-namespace *raft::core*
-
-.. doxygenclass:: raft::device_resources_snmg
-    :project: RAFT
-    :members:
-
 Resource Functions
 ------------------
 

From 501c8ce3b7b0ff56792921ced23ab140c6cea677 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 17 Jan 2025 14:11:52 -0600
Subject: [PATCH 25/37] Use GCC 13 in CUDA 12 conda builds. (#2539)

conda-forge is using GCC 13 for CUDA 12 builds. This PR updates CUDA 12 conda builds to use GCC 13, for alignment.

These PRs should be merged in a specific order; see https://github.com/rapidsai/build-planning/issues/129 for details.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)
  - https://github.com/jakirkham

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2539
---
 .../all_cuda-118_arch-aarch64.yaml            |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-aarch64.yaml            |  4 +--
 .../all_cuda-125_arch-x86_64.yaml             |  4 +--
 conda/recipes/libraft/conda_build_config.yaml | 14 +++++-----
 conda/recipes/libraft/meta.yaml               | 28 ++++++-------------
 .../recipes/pylibraft/conda_build_config.yaml | 14 +++++-----
 conda/recipes/pylibraft/meta.yaml             |  6 ++--
 .../recipes/raft-dask/conda_build_config.yaml | 14 +++++-----
 conda/recipes/raft-dask/meta.yaml             |  6 ++--
 cpp/test/label/label.cu                       |  4 +--
 dependencies.yaml                             | 18 ++++++++++--
 12 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 793ca8dc67..ecd9aa1ece 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -55,6 +55,6 @@ dependencies:
 - spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a9f839bd03..2f655ae077 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -55,6 +55,6 @@ dependencies:
 - spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 9d7286bb8e..d790e985fa 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - dask-cuda==25.2.*,>=0.0.0a0
 - distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
-- gcc_linux-aarch64=11.*
+- gcc_linux-aarch64=13.*
 - graphviz
 - ipython
 - libcublas-dev
@@ -51,6 +51,6 @@ dependencies:
 - spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index e4ec074ae5..63808d99c0 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - dask-cuda==25.2.*,>=0.0.0a0
 - distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
-- gcc_linux-64=11.*
+- gcc_linux-64=13.*
 - graphviz
 - ipython
 - libcublas-dev
@@ -51,6 +51,6 @@ dependencies:
 - spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-125_arch-x86_64
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 4857f12cd1..11b16bc2a8 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index 503c4cb6fb..dbde4e3971 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -39,10 +39,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         {% endif %}
         - librmm
@@ -51,7 +49,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -85,11 +83,7 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
-        {% endif %}
         - librmm
     requirements:
       host:
@@ -130,10 +124,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -145,7 +137,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -196,10 +188,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         {% endif %}
     requirements:
@@ -207,7 +197,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -258,10 +248,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -273,7 +261,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml
index 001878ff25..83f5ebcb15 100644
--- a/conda/recipes/pylibraft/conda_build_config.yaml
+++ b/conda/recipes/pylibraft/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 0b57432402..8f498c7e50 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -18,10 +18,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -31,7 +29,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index 68140e6bc0..d567266027 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 ucx_py_version:
   - "0.42.*"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 19155166af..29c7f568f1 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -18,10 +18,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -31,7 +29,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 4c3479182f..34a336de59 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -59,8 +59,8 @@ TEST_F(MakeMonotonicTest, Result)
 
   ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare<bool>(), stream));
 
-  delete data_h;
-  delete expected_h;
+  delete[] data_h;
+  delete[] expected_h;
 }
 
 TEST(labelTest, Classlabels)
diff --git a/dependencies.yaml b/dependencies.yaml
index 44c240b6ce..a2d75fd3d6 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -184,14 +184,28 @@ dependencies:
         matrices:
           - matrix:
               arch: x86_64
+              cuda: "11.8"
             packages:
               - gcc_linux-64=11.*
-              - sysroot_linux-64==2.17
+              - sysroot_linux-64==2.28
           - matrix:
               arch: aarch64
+              cuda: "11.8"
             packages:
               - gcc_linux-aarch64=11.*
-              - sysroot_linux-aarch64==2.17
+              - sysroot_linux-aarch64==2.28
+          - matrix:
+              arch: x86_64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-64=13.*
+              - sysroot_linux-64==2.28
+          - matrix:
+              arch: aarch64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-aarch64=13.*
+              - sysroot_linux-aarch64==2.28
       - output_types: conda
         matrices:
           - matrix: {cuda: "12.*"}

From 596d4b7338e62a92652503cd76feaeaa187ad740 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 21 Jan 2025 23:55:05 -0600
Subject: [PATCH 26/37] use dynamic CUDA wheels on CUDA 11 (#2548)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Contributes to https://github.com/rapidsai/build-planning/issues/137

Follow-up to #2531.

See the linked issue for many more details, but in short... using a dynamically-loaded libraft which has statically-linked cuBLAS causes issues for other libraries.

There are now aarch64 CUDA 11 wheels for cuBLAS and other CUDA libraries, so it's possible to have RAFT wheels dynamically link against them. This PR does that.

## Notes for Reviewers

This has other side benefits in addition to fixing runtime issues... it also simplifies the wheel-building scripts and CMake, and makes CUDA 11 wheels noticeably smaller 😊

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2548
---
 ci/build_wheel.sh             | 27 +++++++++------------------
 ci/build_wheel_libraft.sh     | 11 -----------
 ci/validate_wheel.sh          | 15 ---------------
 dependencies.yaml             |  5 ++++-
 python/libraft/CMakeLists.txt | 31 +++++++++++--------------------
 python/libraft/pyproject.toml |  4 +++-
 6 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 4c295c416e..976da98998 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -21,24 +21,15 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXCLUDE_ARGS=(
-      --exclude "libcublas.so.12"
-      --exclude "libcublasLt.so.12"
-      --exclude "libcurand.so.10"
-      --exclude "libcusolver.so.11"
-      --exclude "libcusparse.so.12"
-      --exclude "libnvJitLink.so.12"
-      --exclude "libucp.so.0"
-    )
-  ;;
-  11.*)
-    EXCLUDE_ARGS=(
-      --exclude "libucp.so.0"
-    )
-  ;;
-esac
+EXCLUDE_ARGS=(
+  --exclude "libcublas.so.*"
+  --exclude "libcublasLt.so.*"
+  --exclude "libcurand.so.*"
+  --exclude "libcusolver.so.*"
+  --exclude "libcusparse.so.*"
+  --exclude "libnvJitLink.so.*"
+  --exclude "libucp.so.*"
+)
 
 if [[ ${package_name} != "libraft" ]]; then
     EXCLUDE_ARGS+=(
diff --git a/ci/build_wheel_libraft.sh b/ci/build_wheel_libraft.sh
index 825a5124a8..8ff0da1e9a 100755
--- a/ci/build_wheel_libraft.sh
+++ b/ci/build_wheel_libraft.sh
@@ -28,16 +28,5 @@ export PIP_NO_BUILD_ISOLATION=0
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXTRA_CMAKE_ARGS="-DUSE_CUDA_MATH_WHEELS=ON"
-  ;;
-  11.*)
-    EXTRA_CMAKE_ARGS="-DUSE_CUDA_MATH_WHEELS=OFF"
-  ;;
-esac
-
-export SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-
 ci/build_wheel.sh libraft ${package_dir} cpp
 ci/validate_wheel.sh ${package_dir} final_dist libraft
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index ca506af004..ec3867aa30 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -9,27 +9,12 @@ package_name=$3
 
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
-# some packages are much larger on CUDA 11 than on CUDA 12
-PYDISTCHECK_ARGS=()
-if [[ "${package_name}" == "libraft" ]]; then
-    if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
-        PYDISTCHECK_ARGS+=(
-            --max-allowed-size-compressed '750M'
-        )
-    else
-        PYDISTCHECK_ARGS+=(
-            --max-allowed-size-compressed '100M'
-        )
-    fi
-fi
-
 cd "${package_dir}"
 
 rapids-logger "validate packages with 'pydistcheck'"
 
 pydistcheck \
     --inspect \
-    "${PYDISTCHECK_ARGS[@]}" \
     "$(echo ${wheel_dir_relative_path}/*.whl)"
 
 rapids-logger "validate packages with 'twine'"
diff --git a/dependencies.yaml b/dependencies.yaml
index a2d75fd3d6..b7a0344b1a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -358,11 +358,14 @@ dependencies:
               - nvidia-curand-cu12
               - nvidia-cusolver-cu12
               - nvidia-cusparse-cu12
-          # CUDA 11 does not provide wheels, so use the system libraries instead
           - matrix:
               cuda: "11.*"
               use_cuda_wheels: "true"
             packages:
+              - nvidia-cublas-cu11
+              - nvidia-curand-cu11
+              - nvidia-cusolver-cu11
+              - nvidia-cusparse-cu11
           # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
           # (e.g. for DLFW and pip devcontainers)
           - matrix:
diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt
index 57efcd61ab..db81aa9507 100644
--- a/python/libraft/CMakeLists.txt
+++ b/python/libraft/CMakeLists.txt
@@ -22,8 +22,6 @@ project(
   LANGUAGES CXX
 )
 
-option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
-
 # Check if raft is already available. If so, it is the user's responsibility to ensure that the
 # CMake package is also available at build time of the Python raft package.
 find_package(raft "${RAPIDS_VERSION}")
@@ -35,14 +33,8 @@ endif()
 unset(raft_FOUND)
 
 # --- CUDA --- #
-find_package(CUDAToolkit REQUIRED)
 set(CUDA_STATIC_RUNTIME ON)
-set(CUDA_STATIC_MATH_LIBRARIES ON)
-if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
-  set(CUDA_STATIC_MATH_LIBRARIES OFF)
-elseif(USE_CUDA_MATH_WHEELS)
-  message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
-endif()
+set(CUDA_STATIC_MATH_LIBRARIES OFF)
 
 # --- RAFT ---#
 set(BUILD_TESTS OFF)
@@ -52,14 +44,13 @@ set(RAFT_COMPILE_LIBRARY ON)
 
 add_subdirectory(../../cpp raft-cpp)
 
-if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
-  set_property(
-    TARGET raft_lib
-    PROPERTY INSTALL_RPATH
-             "$ORIGIN/../nvidia/cublas/lib"
-             "$ORIGIN/../nvidia/curand/lib"
-             "$ORIGIN/../nvidia/cusolver/lib"
-             "$ORIGIN/../nvidia/cusparse/lib"
-             "$ORIGIN/../nvidia/nvjitlink/lib"
-  )
-endif()
+# RPATHs assume libraft.so is installed 2 levels below site-packages, e.g. site-packages/libraft/lib64/libraft.so
+set_property(
+  TARGET raft_lib
+  PROPERTY INSTALL_RPATH
+           "$ORIGIN/../../nvidia/cublas/lib"
+           "$ORIGIN/../../nvidia/curand/lib"
+           "$ORIGIN/../../nvidia/cusolver/lib"
+           "$ORIGIN/../../nvidia/cusparse/lib"
+           "$ORIGIN/../../nvidia/nvjitlink/lib"
+)
diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml
index 549a1bf651..89b2834614 100644
--- a/python/libraft/pyproject.toml
+++ b/python/libraft/pyproject.toml
@@ -110,6 +110,8 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 
 [tool.pydistcheck]
 select = [
-    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'

From e5b657d96c692bedbf12e895fccb2ca3732c9897 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 22 Jan 2025 11:05:10 -0600
Subject: [PATCH 27/37] Use cuda.bindings layout. (#2545)

This PR updates RAFT to use the new cuda-python `cuda.bindings` layout. See https://github.com/rapidsai/build-planning/issues/117.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/raft/pull/2545
---
 python/pylibraft/pylibraft/common/cuda.pxd          | 2 +-
 python/pylibraft/pylibraft/common/cuda.pyx          | 2 +-
 python/pylibraft/pylibraft/common/handle.pyx        | 2 +-
 python/pylibraft/pylibraft/common/interruptible.pyx | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/pylibraft/pylibraft/common/cuda.pxd b/python/pylibraft/pylibraft/common/cuda.pxd
index a44d9aeb63..934573b51f 100644
--- a/python/pylibraft/pylibraft/common/cuda.pxd
+++ b/python/pylibraft/pylibraft/common/cuda.pxd
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 
 
 cdef class Stream:
diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx
index c164a463ae..cda0fc7168 100644
--- a/python/pylibraft/pylibraft/common/cuda.pyx
+++ b/python/pylibraft/pylibraft/common/cuda.pyx
@@ -19,7 +19,7 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
-from cuda.ccudart cimport (
+from cuda.bindings.cyruntime cimport (
     cudaError_t,
     cudaGetErrorName,
     cudaGetErrorString,
diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx
index d256e671bf..400b667789 100644
--- a/python/pylibraft/pylibraft/common/handle.pyx
+++ b/python/pylibraft/pylibraft/common/handle.pyx
@@ -21,7 +21,7 @@
 
 import functools
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from libc.stdint cimport uintptr_t
 
 from rmm.librmm.cuda_stream_view cimport (
diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx
index c489f2ee20..ceac387f58 100644
--- a/python/pylibraft/pylibraft/common/interruptible.pyx
+++ b/python/pylibraft/pylibraft/common/interruptible.pyx
@@ -22,7 +22,7 @@
 import contextlib
 import signal
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from cython.operator cimport dereference
 
 from rmm.librmm.cuda_stream_view cimport cuda_stream_view

From 0eff2358ce97717779854d660ed10bdc921ded03 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 23 Jan 2025 16:40:08 -0600
Subject: [PATCH 28/37] Rename test to tests. (#2546)

Renames `test` directories to `tests` for alignment with the rest of RAPIDS.

See also: https://github.com/rapidsai/cuvs/issues/587

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/raft/pull/2546
---
 .pre-commit-config.yaml                              |  2 +-
 ci/run_pylibraft_pytests.sh                          |  2 +-
 ci/run_raft_dask_pytests.sh                          |  2 +-
 ci/test_wheel_pylibraft.sh                           |  2 +-
 ci/test_wheel_raft_dask.sh                           |  2 +-
 cpp/CMakeLists.txt                                   |  2 +-
 cpp/{test => tests}/CMakeLists.txt                   |  2 +-
 cpp/{test => tests}/core/bitmap.cu                   |  0
 cpp/{test => tests}/core/bitset.cu                   |  0
 .../core/device_resources_manager.cpp                |  0
 cpp/{test => tests}/core/device_setter.cpp           |  0
 cpp/{test => tests}/core/handle.cpp                  |  0
 cpp/{test => tests}/core/interruptible.cu            |  0
 cpp/{test => tests}/core/logger.cpp                  |  0
 cpp/{test => tests}/core/math_device.cu              |  0
 cpp/{test => tests}/core/math_host.cpp               |  0
 cpp/{test => tests}/core/mdarray.cu                  |  0
 cpp/{test => tests}/core/mdbuffer.cu                 |  0
 cpp/{test => tests}/core/mdspan_copy.cpp             |  0
 cpp/{test => tests}/core/mdspan_copy.cu              |  0
 cpp/{test => tests}/core/mdspan_utils.cu             |  0
 cpp/{test => tests}/core/memory_type.cpp             |  0
 cpp/{test => tests}/core/numpy_serializer.cu         |  0
 cpp/{test => tests}/core/nvtx.cpp                    |  0
 cpp/{test => tests}/core/operators_device.cu         |  0
 cpp/{test => tests}/core/operators_host.cpp          |  0
 cpp/{test => tests}/core/seive.cu                    |  0
 cpp/{test => tests}/core/span.cpp                    |  0
 cpp/{test => tests}/core/span.cu                     |  0
 cpp/{test => tests}/core/sparse_matrix.cpp           |  0
 cpp/{test => tests}/core/sparse_matrix.cu            |  0
 cpp/{test => tests}/core/stream_view.cpp             |  0
 cpp/{test => tests}/core/temporary_device_buffer.cu  |  0
 cpp/{test => tests}/core/test_span.hpp               |  0
 cpp/{test => tests}/ext_headers/00_generate.py       |  0
 cpp/{test => tests}/ext_headers/raft_core_logger.cpp |  0
 .../raft_distance_detail_pairwise_matrix_dispatch.cu |  0
 .../ext_headers/raft_distance_distance.cu            |  0
 .../ext_headers/raft_distance_fused_l2_nn.cu         |  0
 .../raft_linalg_detail_coalesced_reduction.cu        |  0
 .../ext_headers/raft_matrix_detail_select_k.cu       |  0
 .../ext_headers/raft_neighbors_ball_cover.cu         |  0
 .../ext_headers/raft_neighbors_brute_force.cu        |  0
 ...aft_neighbors_detail_ivf_flat_interleaved_scan.cu |  0
 .../raft_neighbors_detail_ivf_flat_search.cu         |  0
 ...aft_neighbors_detail_ivf_pq_compute_similarity.cu |  0
 .../ext_headers/raft_neighbors_ivf_flat.cu           |  0
 .../ext_headers/raft_neighbors_ivf_pq.cu             |  0
 .../ext_headers/raft_neighbors_refine.cu             |  0
 .../raft_sparse_matrix_detail_select_k.cu            |  0
 .../raft_spatial_knn_detail_ball_cover_registers.cu  |  0
 .../raft_spatial_knn_detail_fused_l2_knn.cu          |  0
 cpp/{test => tests}/label/label.cu                   |  0
 cpp/{test => tests}/label/merge_labels.cu            |  0
 cpp/{test => tests}/lap/lap.cu                       |  0
 cpp/{test => tests}/linalg/add.cu                    |  0
 cpp/{test => tests}/linalg/add.cuh                   |  0
 cpp/{test => tests}/linalg/axpy.cu                   |  0
 cpp/{test => tests}/linalg/binary_op.cu              |  0
 cpp/{test => tests}/linalg/binary_op.cuh             |  0
 cpp/{test => tests}/linalg/cholesky_r1.cu            |  0
 cpp/{test => tests}/linalg/coalesced_reduction.cu    |  0
 cpp/{test => tests}/linalg/divide.cu                 |  0
 cpp/{test => tests}/linalg/dot.cu                    |  0
 cpp/{test => tests}/linalg/eig.cu                    |  0
 cpp/{test => tests}/linalg/eig_sel.cu                |  0
 cpp/{test => tests}/linalg/eigen_solvers.cu          |  0
 cpp/{test => tests}/linalg/eltwise.cu                |  0
 cpp/{test => tests}/linalg/gemm_layout.cu            |  0
 cpp/{test => tests}/linalg/gemv.cu                   |  0
 cpp/{test => tests}/linalg/map.cu                    |  0
 cpp/{test => tests}/linalg/map_then_reduce.cu        |  0
 cpp/{test => tests}/linalg/matrix_vector.cu          |  0
 cpp/{test => tests}/linalg/matrix_vector_op.cu       |  0
 cpp/{test => tests}/linalg/matrix_vector_op.cuh      |  0
 cpp/{test => tests}/linalg/mean_squared_error.cu     |  0
 cpp/{test => tests}/linalg/multiply.cu               |  0
 cpp/{test => tests}/linalg/norm.cu                   |  0
 cpp/{test => tests}/linalg/normalize.cu              |  0
 cpp/{test => tests}/linalg/power.cu                  |  0
 cpp/{test => tests}/linalg/randomized_svd.cu         |  0
 cpp/{test => tests}/linalg/reduce.cu                 |  0
 cpp/{test => tests}/linalg/reduce.cuh                |  0
 cpp/{test => tests}/linalg/reduce_cols_by_key.cu     |  0
 cpp/{test => tests}/linalg/reduce_rows_by_key.cu     |  0
 cpp/{test => tests}/linalg/rsvd.cu                   |  0
 cpp/{test => tests}/linalg/sqrt.cu                   |  0
 cpp/{test => tests}/linalg/strided_reduction.cu      |  0
 cpp/{test => tests}/linalg/subtract.cu               |  0
 cpp/{test => tests}/linalg/svd.cu                    |  0
 cpp/{test => tests}/linalg/ternary_op.cu             |  0
 cpp/{test => tests}/linalg/transpose.cu              |  0
 cpp/{test => tests}/linalg/unary_op.cu               |  0
 cpp/{test => tests}/linalg/unary_op.cuh              |  0
 cpp/{test => tests}/matrix/argmax.cu                 |  0
 cpp/{test => tests}/matrix/argmin.cu                 |  0
 cpp/{test => tests}/matrix/columnSort.cu             |  0
 cpp/{test => tests}/matrix/diagonal.cu               |  0
 cpp/{test => tests}/matrix/eye.cu                    |  0
 cpp/{test => tests}/matrix/gather.cu                 |  0
 cpp/{test => tests}/matrix/linewise_op.cu            |  0
 cpp/{test => tests}/matrix/math.cu                   |  0
 cpp/{test => tests}/matrix/matrix.cu                 |  0
 cpp/{test => tests}/matrix/norm.cu                   |  0
 cpp/{test => tests}/matrix/reverse.cu                |  0
 cpp/{test => tests}/matrix/sample_rows.cu            |  0
 cpp/{test => tests}/matrix/scatter.cu                |  0
 cpp/{test => tests}/matrix/select_k.cu               |  0
 cpp/{test => tests}/matrix/select_k.cuh              |  0
 cpp/{test => tests}/matrix/select_large_k.cu         |  0
 cpp/{test => tests}/matrix/slice.cu                  |  0
 cpp/{test => tests}/matrix/triangular.cu             |  0
 cpp/{test => tests}/mr/device/buffer.cpp             |  0
 cpp/{test => tests}/mr/host/buffer.cpp               |  0
 cpp/{test => tests}/neighbors/ball_cover.cu          |  0
 .../neighbors/epsilon_neighborhood.cu                |  0
 cpp/{test => tests}/neighbors/haversine.cu           |  0
 cpp/{test => tests}/neighbors/knn_utils.cuh          |  0
 cpp/{test => tests}/neighbors/spatial_data.h         |  0
 cpp/{test => tests}/random/excess_sampling.cu        |  0
 cpp/{test => tests}/random/make_blobs.cu             |  0
 cpp/{test => tests}/random/make_regression.cu        |  0
 .../random/multi_variable_gaussian.cu                |  0
 cpp/{test => tests}/random/permute.cu                |  0
 .../random/rmat_rectangular_generator.cu             |  0
 cpp/{test => tests}/random/rng.cu                    |  0
 cpp/{test => tests}/random/rng_discrete.cu           |  0
 cpp/{test => tests}/random/rng_int.cu                |  0
 cpp/{test => tests}/random/rng_pcg_host_api.cu       |  0
 .../random/sample_without_replacement.cu             |  0
 cpp/{test => tests}/sparse/add.cu                    |  0
 cpp/{test => tests}/sparse/convert_coo.cu            |  0
 cpp/{test => tests}/sparse/convert_csr.cu            |  0
 cpp/{test => tests}/sparse/csr_row_slice.cu          |  0
 cpp/{test => tests}/sparse/csr_to_dense.cu           |  0
 cpp/{test => tests}/sparse/csr_transpose.cu          |  0
 cpp/{test => tests}/sparse/degree.cu                 |  0
 cpp/{test => tests}/sparse/dist_coo_spmv.cu          |  0
 cpp/{test => tests}/sparse/distance.cu               |  0
 cpp/{test => tests}/sparse/filter.cu                 |  0
 cpp/{test => tests}/sparse/masked_matmul.cu          |  0
 cpp/{test => tests}/sparse/mst.cu                    |  0
 cpp/{test => tests}/sparse/norm.cu                   |  0
 cpp/{test => tests}/sparse/normalize.cu              |  0
 cpp/{test => tests}/sparse/reduce.cu                 |  0
 cpp/{test => tests}/sparse/row_op.cu                 |  0
 cpp/{test => tests}/sparse/sddmm.cu                  |  0
 cpp/{test => tests}/sparse/select_k_csr.cu           |  0
 cpp/{test => tests}/sparse/solver/lanczos.cu         |  0
 cpp/{test => tests}/sparse/sort.cu                   |  0
 cpp/{test => tests}/sparse/spectral_matrix.cu        |  0
 cpp/{test => tests}/sparse/spgemmi.cu                |  0
 cpp/{test => tests}/sparse/spmm.cu                   |  0
 cpp/{test => tests}/sparse/symmetrize.cu             |  0
 cpp/{test => tests}/stats/accuracy.cu                |  0
 cpp/{test => tests}/stats/adjusted_rand_index.cu     |  0
 cpp/{test => tests}/stats/completeness_score.cu      |  0
 cpp/{test => tests}/stats/contingencyMatrix.cu       |  0
 cpp/{test => tests}/stats/cov.cu                     |  0
 cpp/{test => tests}/stats/dispersion.cu              |  0
 cpp/{test => tests}/stats/entropy.cu                 |  0
 cpp/{test => tests}/stats/histogram.cu               |  0
 cpp/{test => tests}/stats/homogeneity_score.cu       |  0
 cpp/{test => tests}/stats/information_criterion.cu   |  0
 cpp/{test => tests}/stats/kl_divergence.cu           |  0
 cpp/{test => tests}/stats/mean.cu                    |  0
 cpp/{test => tests}/stats/mean_center.cu             |  0
 cpp/{test => tests}/stats/meanvar.cu                 |  0
 cpp/{test => tests}/stats/minmax.cu                  |  0
 cpp/{test => tests}/stats/mutual_info_score.cu       |  0
 cpp/{test => tests}/stats/r2_score.cu                |  0
 cpp/{test => tests}/stats/rand_index.cu              |  0
 cpp/{test => tests}/stats/regression_metrics.cu      |  0
 cpp/{test => tests}/stats/stddev.cu                  |  0
 cpp/{test => tests}/stats/sum.cu                     |  0
 cpp/{test => tests}/stats/v_measure.cu               |  0
 cpp/{test => tests}/stats/weighted_mean.cu           |  0
 cpp/{test => tests}/test.cpp                         |  0
 cpp/{test => tests}/test_utils.cuh                   |  0
 cpp/{test => tests}/test_utils.h                     |  0
 cpp/{test => tests}/util/bitonic_sort.cu             |  0
 cpp/{test => tests}/util/cudart_utils.cpp            |  0
 cpp/{test => tests}/util/device_atomics.cu           |  0
 cpp/{test => tests}/util/integer_utils.cpp           |  0
 cpp/{test => tests}/util/integer_utils.cu            |  0
 cpp/{test => tests}/util/memory_type_dispatcher.cu   |  0
 cpp/{test => tests}/util/popc.cu                     |  0
 cpp/{test => tests}/util/pow2_utils.cu               |  0
 cpp/{test => tests}/util/reduction.cu                |  0
 docs/source/developer_guide.md                       | 12 ++++++------
 pyproject.toml                                       |  2 +-
 .../pylibraft/pylibraft/{test => tests}/__init__py   |  0
 .../pylibraft/pylibraft/{test => tests}/pytest.ini   |  0
 .../pylibraft/{test => tests}/test_cai_wrapper.py    |  0
 .../pylibraft/{test => tests}/test_config.py         |  0
 .../pylibraft/{test => tests}/test_device_ndarray.py |  0
 .../pylibraft/{test => tests}/test_doctests.py       |  0
 .../pylibraft/{test => tests}/test_handle.py         |  0
 .../{test => tests}/test_mdspan_serializer.py        |  0
 .../pylibraft/{test => tests}/test_random.py         |  0
 .../pylibraft/{test => tests}/test_sparse.py         |  0
 .../pylibraft/{test => tests}/test_version.py        |  0
 .../{test => tests}/test_z_interruptible.py          |  0
 .../raft-dask/raft_dask/{test => tests}/conftest.py  |  0
 .../raft-dask/raft_dask/{test => tests}/pytest.ini   |  0
 .../raft_dask/{test => tests}/test_comms.py          |  0
 .../raft-dask/raft_dask/{test => tests}/test_raft.py |  0
 .../raft_dask/{test => tests}/test_version.py        |  0
 208 files changed, 14 insertions(+), 14 deletions(-)
 rename cpp/{test => tests}/CMakeLists.txt (99%)
 rename cpp/{test => tests}/core/bitmap.cu (100%)
 rename cpp/{test => tests}/core/bitset.cu (100%)
 rename cpp/{test => tests}/core/device_resources_manager.cpp (100%)
 rename cpp/{test => tests}/core/device_setter.cpp (100%)
 rename cpp/{test => tests}/core/handle.cpp (100%)
 rename cpp/{test => tests}/core/interruptible.cu (100%)
 rename cpp/{test => tests}/core/logger.cpp (100%)
 rename cpp/{test => tests}/core/math_device.cu (100%)
 rename cpp/{test => tests}/core/math_host.cpp (100%)
 rename cpp/{test => tests}/core/mdarray.cu (100%)
 rename cpp/{test => tests}/core/mdbuffer.cu (100%)
 rename cpp/{test => tests}/core/mdspan_copy.cpp (100%)
 rename cpp/{test => tests}/core/mdspan_copy.cu (100%)
 rename cpp/{test => tests}/core/mdspan_utils.cu (100%)
 rename cpp/{test => tests}/core/memory_type.cpp (100%)
 rename cpp/{test => tests}/core/numpy_serializer.cu (100%)
 rename cpp/{test => tests}/core/nvtx.cpp (100%)
 rename cpp/{test => tests}/core/operators_device.cu (100%)
 rename cpp/{test => tests}/core/operators_host.cpp (100%)
 rename cpp/{test => tests}/core/seive.cu (100%)
 rename cpp/{test => tests}/core/span.cpp (100%)
 rename cpp/{test => tests}/core/span.cu (100%)
 rename cpp/{test => tests}/core/sparse_matrix.cpp (100%)
 rename cpp/{test => tests}/core/sparse_matrix.cu (100%)
 rename cpp/{test => tests}/core/stream_view.cpp (100%)
 rename cpp/{test => tests}/core/temporary_device_buffer.cu (100%)
 rename cpp/{test => tests}/core/test_span.hpp (100%)
 rename cpp/{test => tests}/ext_headers/00_generate.py (100%)
 rename cpp/{test => tests}/ext_headers/raft_core_logger.cpp (100%)
 rename cpp/{test => tests}/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_distance_distance.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_distance_fused_l2_nn.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_linalg_detail_coalesced_reduction.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_matrix_detail_select_k.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_ball_cover.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_brute_force.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_detail_ivf_flat_search.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_ivf_flat.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_ivf_pq.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_neighbors_refine.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_sparse_matrix_detail_select_k.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu (100%)
 rename cpp/{test => tests}/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu (100%)
 rename cpp/{test => tests}/label/label.cu (100%)
 rename cpp/{test => tests}/label/merge_labels.cu (100%)
 rename cpp/{test => tests}/lap/lap.cu (100%)
 rename cpp/{test => tests}/linalg/add.cu (100%)
 rename cpp/{test => tests}/linalg/add.cuh (100%)
 rename cpp/{test => tests}/linalg/axpy.cu (100%)
 rename cpp/{test => tests}/linalg/binary_op.cu (100%)
 rename cpp/{test => tests}/linalg/binary_op.cuh (100%)
 rename cpp/{test => tests}/linalg/cholesky_r1.cu (100%)
 rename cpp/{test => tests}/linalg/coalesced_reduction.cu (100%)
 rename cpp/{test => tests}/linalg/divide.cu (100%)
 rename cpp/{test => tests}/linalg/dot.cu (100%)
 rename cpp/{test => tests}/linalg/eig.cu (100%)
 rename cpp/{test => tests}/linalg/eig_sel.cu (100%)
 rename cpp/{test => tests}/linalg/eigen_solvers.cu (100%)
 rename cpp/{test => tests}/linalg/eltwise.cu (100%)
 rename cpp/{test => tests}/linalg/gemm_layout.cu (100%)
 rename cpp/{test => tests}/linalg/gemv.cu (100%)
 rename cpp/{test => tests}/linalg/map.cu (100%)
 rename cpp/{test => tests}/linalg/map_then_reduce.cu (100%)
 rename cpp/{test => tests}/linalg/matrix_vector.cu (100%)
 rename cpp/{test => tests}/linalg/matrix_vector_op.cu (100%)
 rename cpp/{test => tests}/linalg/matrix_vector_op.cuh (100%)
 rename cpp/{test => tests}/linalg/mean_squared_error.cu (100%)
 rename cpp/{test => tests}/linalg/multiply.cu (100%)
 rename cpp/{test => tests}/linalg/norm.cu (100%)
 rename cpp/{test => tests}/linalg/normalize.cu (100%)
 rename cpp/{test => tests}/linalg/power.cu (100%)
 rename cpp/{test => tests}/linalg/randomized_svd.cu (100%)
 rename cpp/{test => tests}/linalg/reduce.cu (100%)
 rename cpp/{test => tests}/linalg/reduce.cuh (100%)
 rename cpp/{test => tests}/linalg/reduce_cols_by_key.cu (100%)
 rename cpp/{test => tests}/linalg/reduce_rows_by_key.cu (100%)
 rename cpp/{test => tests}/linalg/rsvd.cu (100%)
 rename cpp/{test => tests}/linalg/sqrt.cu (100%)
 rename cpp/{test => tests}/linalg/strided_reduction.cu (100%)
 rename cpp/{test => tests}/linalg/subtract.cu (100%)
 rename cpp/{test => tests}/linalg/svd.cu (100%)
 rename cpp/{test => tests}/linalg/ternary_op.cu (100%)
 rename cpp/{test => tests}/linalg/transpose.cu (100%)
 rename cpp/{test => tests}/linalg/unary_op.cu (100%)
 rename cpp/{test => tests}/linalg/unary_op.cuh (100%)
 rename cpp/{test => tests}/matrix/argmax.cu (100%)
 rename cpp/{test => tests}/matrix/argmin.cu (100%)
 rename cpp/{test => tests}/matrix/columnSort.cu (100%)
 rename cpp/{test => tests}/matrix/diagonal.cu (100%)
 rename cpp/{test => tests}/matrix/eye.cu (100%)
 rename cpp/{test => tests}/matrix/gather.cu (100%)
 rename cpp/{test => tests}/matrix/linewise_op.cu (100%)
 rename cpp/{test => tests}/matrix/math.cu (100%)
 rename cpp/{test => tests}/matrix/matrix.cu (100%)
 rename cpp/{test => tests}/matrix/norm.cu (100%)
 rename cpp/{test => tests}/matrix/reverse.cu (100%)
 rename cpp/{test => tests}/matrix/sample_rows.cu (100%)
 rename cpp/{test => tests}/matrix/scatter.cu (100%)
 rename cpp/{test => tests}/matrix/select_k.cu (100%)
 rename cpp/{test => tests}/matrix/select_k.cuh (100%)
 rename cpp/{test => tests}/matrix/select_large_k.cu (100%)
 rename cpp/{test => tests}/matrix/slice.cu (100%)
 rename cpp/{test => tests}/matrix/triangular.cu (100%)
 rename cpp/{test => tests}/mr/device/buffer.cpp (100%)
 rename cpp/{test => tests}/mr/host/buffer.cpp (100%)
 rename cpp/{test => tests}/neighbors/ball_cover.cu (100%)
 rename cpp/{test => tests}/neighbors/epsilon_neighborhood.cu (100%)
 rename cpp/{test => tests}/neighbors/haversine.cu (100%)
 rename cpp/{test => tests}/neighbors/knn_utils.cuh (100%)
 rename cpp/{test => tests}/neighbors/spatial_data.h (100%)
 rename cpp/{test => tests}/random/excess_sampling.cu (100%)
 rename cpp/{test => tests}/random/make_blobs.cu (100%)
 rename cpp/{test => tests}/random/make_regression.cu (100%)
 rename cpp/{test => tests}/random/multi_variable_gaussian.cu (100%)
 rename cpp/{test => tests}/random/permute.cu (100%)
 rename cpp/{test => tests}/random/rmat_rectangular_generator.cu (100%)
 rename cpp/{test => tests}/random/rng.cu (100%)
 rename cpp/{test => tests}/random/rng_discrete.cu (100%)
 rename cpp/{test => tests}/random/rng_int.cu (100%)
 rename cpp/{test => tests}/random/rng_pcg_host_api.cu (100%)
 rename cpp/{test => tests}/random/sample_without_replacement.cu (100%)
 rename cpp/{test => tests}/sparse/add.cu (100%)
 rename cpp/{test => tests}/sparse/convert_coo.cu (100%)
 rename cpp/{test => tests}/sparse/convert_csr.cu (100%)
 rename cpp/{test => tests}/sparse/csr_row_slice.cu (100%)
 rename cpp/{test => tests}/sparse/csr_to_dense.cu (100%)
 rename cpp/{test => tests}/sparse/csr_transpose.cu (100%)
 rename cpp/{test => tests}/sparse/degree.cu (100%)
 rename cpp/{test => tests}/sparse/dist_coo_spmv.cu (100%)
 rename cpp/{test => tests}/sparse/distance.cu (100%)
 rename cpp/{test => tests}/sparse/filter.cu (100%)
 rename cpp/{test => tests}/sparse/masked_matmul.cu (100%)
 rename cpp/{test => tests}/sparse/mst.cu (100%)
 rename cpp/{test => tests}/sparse/norm.cu (100%)
 rename cpp/{test => tests}/sparse/normalize.cu (100%)
 rename cpp/{test => tests}/sparse/reduce.cu (100%)
 rename cpp/{test => tests}/sparse/row_op.cu (100%)
 rename cpp/{test => tests}/sparse/sddmm.cu (100%)
 rename cpp/{test => tests}/sparse/select_k_csr.cu (100%)
 rename cpp/{test => tests}/sparse/solver/lanczos.cu (100%)
 rename cpp/{test => tests}/sparse/sort.cu (100%)
 rename cpp/{test => tests}/sparse/spectral_matrix.cu (100%)
 rename cpp/{test => tests}/sparse/spgemmi.cu (100%)
 rename cpp/{test => tests}/sparse/spmm.cu (100%)
 rename cpp/{test => tests}/sparse/symmetrize.cu (100%)
 rename cpp/{test => tests}/stats/accuracy.cu (100%)
 rename cpp/{test => tests}/stats/adjusted_rand_index.cu (100%)
 rename cpp/{test => tests}/stats/completeness_score.cu (100%)
 rename cpp/{test => tests}/stats/contingencyMatrix.cu (100%)
 rename cpp/{test => tests}/stats/cov.cu (100%)
 rename cpp/{test => tests}/stats/dispersion.cu (100%)
 rename cpp/{test => tests}/stats/entropy.cu (100%)
 rename cpp/{test => tests}/stats/histogram.cu (100%)
 rename cpp/{test => tests}/stats/homogeneity_score.cu (100%)
 rename cpp/{test => tests}/stats/information_criterion.cu (100%)
 rename cpp/{test => tests}/stats/kl_divergence.cu (100%)
 rename cpp/{test => tests}/stats/mean.cu (100%)
 rename cpp/{test => tests}/stats/mean_center.cu (100%)
 rename cpp/{test => tests}/stats/meanvar.cu (100%)
 rename cpp/{test => tests}/stats/minmax.cu (100%)
 rename cpp/{test => tests}/stats/mutual_info_score.cu (100%)
 rename cpp/{test => tests}/stats/r2_score.cu (100%)
 rename cpp/{test => tests}/stats/rand_index.cu (100%)
 rename cpp/{test => tests}/stats/regression_metrics.cu (100%)
 rename cpp/{test => tests}/stats/stddev.cu (100%)
 rename cpp/{test => tests}/stats/sum.cu (100%)
 rename cpp/{test => tests}/stats/v_measure.cu (100%)
 rename cpp/{test => tests}/stats/weighted_mean.cu (100%)
 rename cpp/{test => tests}/test.cpp (100%)
 rename cpp/{test => tests}/test_utils.cuh (100%)
 rename cpp/{test => tests}/test_utils.h (100%)
 rename cpp/{test => tests}/util/bitonic_sort.cu (100%)
 rename cpp/{test => tests}/util/cudart_utils.cpp (100%)
 rename cpp/{test => tests}/util/device_atomics.cu (100%)
 rename cpp/{test => tests}/util/integer_utils.cpp (100%)
 rename cpp/{test => tests}/util/integer_utils.cu (100%)
 rename cpp/{test => tests}/util/memory_type_dispatcher.cu (100%)
 rename cpp/{test => tests}/util/popc.cu (100%)
 rename cpp/{test => tests}/util/pow2_utils.cu (100%)
 rename cpp/{test => tests}/util/reduction.cu (100%)
 rename python/pylibraft/pylibraft/{test => tests}/__init__py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/pytest.ini (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_cai_wrapper.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_config.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_device_ndarray.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_doctests.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_handle.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_mdspan_serializer.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_random.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_sparse.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_version.py (100%)
 rename python/pylibraft/pylibraft/{test => tests}/test_z_interruptible.py (100%)
 rename python/raft-dask/raft_dask/{test => tests}/conftest.py (100%)
 rename python/raft-dask/raft_dask/{test => tests}/pytest.ini (100%)
 rename python/raft-dask/raft_dask/{test => tests}/test_comms.py (100%)
 rename python/raft-dask/raft_dask/{test => tests}/test_raft.py (100%)
 rename python/raft-dask/raft_dask/{test => tests}/test_version.py (100%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d5456ba30b..ca1efc3abd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -83,7 +83,7 @@ repos:
                 exclude: .*/thirdparty/.*
               - id: include-check
                 name: include-check
-                entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/test
+                entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/tests
                 pass_filenames: false
                 language: python
                 additional_dependencies: [gitpython]
diff --git a/ci/run_pylibraft_pytests.sh b/ci/run_pylibraft_pytests.sh
index 1167b89c5f..7f3d1f9cfb 100755
--- a/ci/run_pylibraft_pytests.sh
+++ b/ci/run_pylibraft_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_pylibraft_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/pylibraft/pylibraft
 
-pytest --cache-clear "$@" test
+pytest --cache-clear "$@" tests
diff --git a/ci/run_raft_dask_pytests.sh b/ci/run_raft_dask_pytests.sh
index 07d0b5baa0..a9e6a130cd 100755
--- a/ci/run_raft_dask_pytests.sh
+++ b/ci/run_raft_dask_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_raft_dask_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/raft-dask/raft_dask
 
-pytest --cache-clear --import-mode=append "$@" test
+pytest --cache-clear --import-mode=append "$@" tests
diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index 1e0b34d609..26f4da267f 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -14,4 +14,4 @@ python -m pip install \
     ./local-libraft-dep/libraft*.whl \
     "$(echo ./dist/pylibraft*.whl)[test]"
 
-python -m pytest ./python/pylibraft/pylibraft/test
+python -m pytest ./python/pylibraft/pylibraft/tests
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index 011de4d409..c394314aac 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -15,7 +15,7 @@ python -m pip install -v \
     ./local-pylibraft-dep/pylibraft*.whl \
     "$(echo ./dist/raft_dask_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
 
-test_dir="python/raft-dask/raft_dask/test"
+test_dir="python/raft-dask/raft_dask/tests"
 
 rapids-logger "pytest raft-dask"
 python -m pytest --import-mode=append ${test_dir}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eb7e8540f0..c38471bebd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -586,7 +586,7 @@ endif()
 # * build test executable ----------------------------------------------------
 
 if(BUILD_TESTS)
-  add_subdirectory(test)
+  add_subdirectory(tests)
 endif()
 
 # ##################################################################################################
diff --git a/cpp/test/CMakeLists.txt b/cpp/tests/CMakeLists.txt
similarity index 99%
rename from cpp/test/CMakeLists.txt
rename to cpp/tests/CMakeLists.txt
index 4cd0a32f51..9f96b93e7a 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -77,7 +77,7 @@ function(ConfigureTest)
     target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA")
   endif()
 
-  target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
+  target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/tests>")
 
   rapids_test_add(
     NAME ${TEST_NAME}
diff --git a/cpp/test/core/bitmap.cu b/cpp/tests/core/bitmap.cu
similarity index 100%
rename from cpp/test/core/bitmap.cu
rename to cpp/tests/core/bitmap.cu
diff --git a/cpp/test/core/bitset.cu b/cpp/tests/core/bitset.cu
similarity index 100%
rename from cpp/test/core/bitset.cu
rename to cpp/tests/core/bitset.cu
diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/tests/core/device_resources_manager.cpp
similarity index 100%
rename from cpp/test/core/device_resources_manager.cpp
rename to cpp/tests/core/device_resources_manager.cpp
diff --git a/cpp/test/core/device_setter.cpp b/cpp/tests/core/device_setter.cpp
similarity index 100%
rename from cpp/test/core/device_setter.cpp
rename to cpp/tests/core/device_setter.cpp
diff --git a/cpp/test/core/handle.cpp b/cpp/tests/core/handle.cpp
similarity index 100%
rename from cpp/test/core/handle.cpp
rename to cpp/tests/core/handle.cpp
diff --git a/cpp/test/core/interruptible.cu b/cpp/tests/core/interruptible.cu
similarity index 100%
rename from cpp/test/core/interruptible.cu
rename to cpp/tests/core/interruptible.cu
diff --git a/cpp/test/core/logger.cpp b/cpp/tests/core/logger.cpp
similarity index 100%
rename from cpp/test/core/logger.cpp
rename to cpp/tests/core/logger.cpp
diff --git a/cpp/test/core/math_device.cu b/cpp/tests/core/math_device.cu
similarity index 100%
rename from cpp/test/core/math_device.cu
rename to cpp/tests/core/math_device.cu
diff --git a/cpp/test/core/math_host.cpp b/cpp/tests/core/math_host.cpp
similarity index 100%
rename from cpp/test/core/math_host.cpp
rename to cpp/tests/core/math_host.cpp
diff --git a/cpp/test/core/mdarray.cu b/cpp/tests/core/mdarray.cu
similarity index 100%
rename from cpp/test/core/mdarray.cu
rename to cpp/tests/core/mdarray.cu
diff --git a/cpp/test/core/mdbuffer.cu b/cpp/tests/core/mdbuffer.cu
similarity index 100%
rename from cpp/test/core/mdbuffer.cu
rename to cpp/tests/core/mdbuffer.cu
diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/tests/core/mdspan_copy.cpp
similarity index 100%
rename from cpp/test/core/mdspan_copy.cpp
rename to cpp/tests/core/mdspan_copy.cpp
diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/tests/core/mdspan_copy.cu
similarity index 100%
rename from cpp/test/core/mdspan_copy.cu
rename to cpp/tests/core/mdspan_copy.cu
diff --git a/cpp/test/core/mdspan_utils.cu b/cpp/tests/core/mdspan_utils.cu
similarity index 100%
rename from cpp/test/core/mdspan_utils.cu
rename to cpp/tests/core/mdspan_utils.cu
diff --git a/cpp/test/core/memory_type.cpp b/cpp/tests/core/memory_type.cpp
similarity index 100%
rename from cpp/test/core/memory_type.cpp
rename to cpp/tests/core/memory_type.cpp
diff --git a/cpp/test/core/numpy_serializer.cu b/cpp/tests/core/numpy_serializer.cu
similarity index 100%
rename from cpp/test/core/numpy_serializer.cu
rename to cpp/tests/core/numpy_serializer.cu
diff --git a/cpp/test/core/nvtx.cpp b/cpp/tests/core/nvtx.cpp
similarity index 100%
rename from cpp/test/core/nvtx.cpp
rename to cpp/tests/core/nvtx.cpp
diff --git a/cpp/test/core/operators_device.cu b/cpp/tests/core/operators_device.cu
similarity index 100%
rename from cpp/test/core/operators_device.cu
rename to cpp/tests/core/operators_device.cu
diff --git a/cpp/test/core/operators_host.cpp b/cpp/tests/core/operators_host.cpp
similarity index 100%
rename from cpp/test/core/operators_host.cpp
rename to cpp/tests/core/operators_host.cpp
diff --git a/cpp/test/core/seive.cu b/cpp/tests/core/seive.cu
similarity index 100%
rename from cpp/test/core/seive.cu
rename to cpp/tests/core/seive.cu
diff --git a/cpp/test/core/span.cpp b/cpp/tests/core/span.cpp
similarity index 100%
rename from cpp/test/core/span.cpp
rename to cpp/tests/core/span.cpp
diff --git a/cpp/test/core/span.cu b/cpp/tests/core/span.cu
similarity index 100%
rename from cpp/test/core/span.cu
rename to cpp/tests/core/span.cu
diff --git a/cpp/test/core/sparse_matrix.cpp b/cpp/tests/core/sparse_matrix.cpp
similarity index 100%
rename from cpp/test/core/sparse_matrix.cpp
rename to cpp/tests/core/sparse_matrix.cpp
diff --git a/cpp/test/core/sparse_matrix.cu b/cpp/tests/core/sparse_matrix.cu
similarity index 100%
rename from cpp/test/core/sparse_matrix.cu
rename to cpp/tests/core/sparse_matrix.cu
diff --git a/cpp/test/core/stream_view.cpp b/cpp/tests/core/stream_view.cpp
similarity index 100%
rename from cpp/test/core/stream_view.cpp
rename to cpp/tests/core/stream_view.cpp
diff --git a/cpp/test/core/temporary_device_buffer.cu b/cpp/tests/core/temporary_device_buffer.cu
similarity index 100%
rename from cpp/test/core/temporary_device_buffer.cu
rename to cpp/tests/core/temporary_device_buffer.cu
diff --git a/cpp/test/core/test_span.hpp b/cpp/tests/core/test_span.hpp
similarity index 100%
rename from cpp/test/core/test_span.hpp
rename to cpp/tests/core/test_span.hpp
diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/tests/ext_headers/00_generate.py
similarity index 100%
rename from cpp/test/ext_headers/00_generate.py
rename to cpp/tests/ext_headers/00_generate.py
diff --git a/cpp/test/ext_headers/raft_core_logger.cpp b/cpp/tests/ext_headers/raft_core_logger.cpp
similarity index 100%
rename from cpp/test/ext_headers/raft_core_logger.cpp
rename to cpp/tests/ext_headers/raft_core_logger.cpp
diff --git a/cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu b/cpp/tests/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
rename to cpp/tests/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
diff --git a/cpp/test/ext_headers/raft_distance_distance.cu b/cpp/tests/ext_headers/raft_distance_distance.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_distance.cu
rename to cpp/tests/ext_headers/raft_distance_distance.cu
diff --git a/cpp/test/ext_headers/raft_distance_fused_l2_nn.cu b/cpp/tests/ext_headers/raft_distance_fused_l2_nn.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_fused_l2_nn.cu
rename to cpp/tests/ext_headers/raft_distance_fused_l2_nn.cu
diff --git a/cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu b/cpp/tests/ext_headers/raft_linalg_detail_coalesced_reduction.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
rename to cpp/tests/ext_headers/raft_linalg_detail_coalesced_reduction.cu
diff --git a/cpp/test/ext_headers/raft_matrix_detail_select_k.cu b/cpp/tests/ext_headers/raft_matrix_detail_select_k.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_matrix_detail_select_k.cu
rename to cpp/tests/ext_headers/raft_matrix_detail_select_k.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ball_cover.cu b/cpp/tests/ext_headers/raft_neighbors_ball_cover.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ball_cover.cu
rename to cpp/tests/ext_headers/raft_neighbors_ball_cover.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_brute_force.cu b/cpp/tests/ext_headers/raft_neighbors_brute_force.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_brute_force.cu
rename to cpp/tests/ext_headers/raft_neighbors_brute_force.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_flat.cu b/cpp/tests/ext_headers/raft_neighbors_ivf_flat.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ivf_flat.cu
rename to cpp/tests/ext_headers/raft_neighbors_ivf_flat.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_pq.cu b/cpp/tests/ext_headers/raft_neighbors_ivf_pq.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ivf_pq.cu
rename to cpp/tests/ext_headers/raft_neighbors_ivf_pq.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_refine.cu b/cpp/tests/ext_headers/raft_neighbors_refine.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_refine.cu
rename to cpp/tests/ext_headers/raft_neighbors_refine.cu
diff --git a/cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu b/cpp/tests/ext_headers/raft_sparse_matrix_detail_select_k.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu
rename to cpp/tests/ext_headers/raft_sparse_matrix_detail_select_k.cu
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu b/cpp/tests/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
rename to cpp/tests/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu b/cpp/tests/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
rename to cpp/tests/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
diff --git a/cpp/test/label/label.cu b/cpp/tests/label/label.cu
similarity index 100%
rename from cpp/test/label/label.cu
rename to cpp/tests/label/label.cu
diff --git a/cpp/test/label/merge_labels.cu b/cpp/tests/label/merge_labels.cu
similarity index 100%
rename from cpp/test/label/merge_labels.cu
rename to cpp/tests/label/merge_labels.cu
diff --git a/cpp/test/lap/lap.cu b/cpp/tests/lap/lap.cu
similarity index 100%
rename from cpp/test/lap/lap.cu
rename to cpp/tests/lap/lap.cu
diff --git a/cpp/test/linalg/add.cu b/cpp/tests/linalg/add.cu
similarity index 100%
rename from cpp/test/linalg/add.cu
rename to cpp/tests/linalg/add.cu
diff --git a/cpp/test/linalg/add.cuh b/cpp/tests/linalg/add.cuh
similarity index 100%
rename from cpp/test/linalg/add.cuh
rename to cpp/tests/linalg/add.cuh
diff --git a/cpp/test/linalg/axpy.cu b/cpp/tests/linalg/axpy.cu
similarity index 100%
rename from cpp/test/linalg/axpy.cu
rename to cpp/tests/linalg/axpy.cu
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/tests/linalg/binary_op.cu
similarity index 100%
rename from cpp/test/linalg/binary_op.cu
rename to cpp/tests/linalg/binary_op.cu
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/tests/linalg/binary_op.cuh
similarity index 100%
rename from cpp/test/linalg/binary_op.cuh
rename to cpp/tests/linalg/binary_op.cuh
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/tests/linalg/cholesky_r1.cu
similarity index 100%
rename from cpp/test/linalg/cholesky_r1.cu
rename to cpp/tests/linalg/cholesky_r1.cu
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/tests/linalg/coalesced_reduction.cu
similarity index 100%
rename from cpp/test/linalg/coalesced_reduction.cu
rename to cpp/tests/linalg/coalesced_reduction.cu
diff --git a/cpp/test/linalg/divide.cu b/cpp/tests/linalg/divide.cu
similarity index 100%
rename from cpp/test/linalg/divide.cu
rename to cpp/tests/linalg/divide.cu
diff --git a/cpp/test/linalg/dot.cu b/cpp/tests/linalg/dot.cu
similarity index 100%
rename from cpp/test/linalg/dot.cu
rename to cpp/tests/linalg/dot.cu
diff --git a/cpp/test/linalg/eig.cu b/cpp/tests/linalg/eig.cu
similarity index 100%
rename from cpp/test/linalg/eig.cu
rename to cpp/tests/linalg/eig.cu
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/tests/linalg/eig_sel.cu
similarity index 100%
rename from cpp/test/linalg/eig_sel.cu
rename to cpp/tests/linalg/eig_sel.cu
diff --git a/cpp/test/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
similarity index 100%
rename from cpp/test/linalg/eigen_solvers.cu
rename to cpp/tests/linalg/eigen_solvers.cu
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/tests/linalg/eltwise.cu
similarity index 100%
rename from cpp/test/linalg/eltwise.cu
rename to cpp/tests/linalg/eltwise.cu
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/tests/linalg/gemm_layout.cu
similarity index 100%
rename from cpp/test/linalg/gemm_layout.cu
rename to cpp/tests/linalg/gemm_layout.cu
diff --git a/cpp/test/linalg/gemv.cu b/cpp/tests/linalg/gemv.cu
similarity index 100%
rename from cpp/test/linalg/gemv.cu
rename to cpp/tests/linalg/gemv.cu
diff --git a/cpp/test/linalg/map.cu b/cpp/tests/linalg/map.cu
similarity index 100%
rename from cpp/test/linalg/map.cu
rename to cpp/tests/linalg/map.cu
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/tests/linalg/map_then_reduce.cu
similarity index 100%
rename from cpp/test/linalg/map_then_reduce.cu
rename to cpp/tests/linalg/map_then_reduce.cu
diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/tests/linalg/matrix_vector.cu
similarity index 100%
rename from cpp/test/linalg/matrix_vector.cu
rename to cpp/tests/linalg/matrix_vector.cu
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/tests/linalg/matrix_vector_op.cu
similarity index 100%
rename from cpp/test/linalg/matrix_vector_op.cu
rename to cpp/tests/linalg/matrix_vector_op.cu
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/tests/linalg/matrix_vector_op.cuh
similarity index 100%
rename from cpp/test/linalg/matrix_vector_op.cuh
rename to cpp/tests/linalg/matrix_vector_op.cuh
diff --git a/cpp/test/linalg/mean_squared_error.cu b/cpp/tests/linalg/mean_squared_error.cu
similarity index 100%
rename from cpp/test/linalg/mean_squared_error.cu
rename to cpp/tests/linalg/mean_squared_error.cu
diff --git a/cpp/test/linalg/multiply.cu b/cpp/tests/linalg/multiply.cu
similarity index 100%
rename from cpp/test/linalg/multiply.cu
rename to cpp/tests/linalg/multiply.cu
diff --git a/cpp/test/linalg/norm.cu b/cpp/tests/linalg/norm.cu
similarity index 100%
rename from cpp/test/linalg/norm.cu
rename to cpp/tests/linalg/norm.cu
diff --git a/cpp/test/linalg/normalize.cu b/cpp/tests/linalg/normalize.cu
similarity index 100%
rename from cpp/test/linalg/normalize.cu
rename to cpp/tests/linalg/normalize.cu
diff --git a/cpp/test/linalg/power.cu b/cpp/tests/linalg/power.cu
similarity index 100%
rename from cpp/test/linalg/power.cu
rename to cpp/tests/linalg/power.cu
diff --git a/cpp/test/linalg/randomized_svd.cu b/cpp/tests/linalg/randomized_svd.cu
similarity index 100%
rename from cpp/test/linalg/randomized_svd.cu
rename to cpp/tests/linalg/randomized_svd.cu
diff --git a/cpp/test/linalg/reduce.cu b/cpp/tests/linalg/reduce.cu
similarity index 100%
rename from cpp/test/linalg/reduce.cu
rename to cpp/tests/linalg/reduce.cu
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/tests/linalg/reduce.cuh
similarity index 100%
rename from cpp/test/linalg/reduce.cuh
rename to cpp/tests/linalg/reduce.cuh
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/tests/linalg/reduce_cols_by_key.cu
similarity index 100%
rename from cpp/test/linalg/reduce_cols_by_key.cu
rename to cpp/tests/linalg/reduce_cols_by_key.cu
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/tests/linalg/reduce_rows_by_key.cu
similarity index 100%
rename from cpp/test/linalg/reduce_rows_by_key.cu
rename to cpp/tests/linalg/reduce_rows_by_key.cu
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/tests/linalg/rsvd.cu
similarity index 100%
rename from cpp/test/linalg/rsvd.cu
rename to cpp/tests/linalg/rsvd.cu
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/tests/linalg/sqrt.cu
similarity index 100%
rename from cpp/test/linalg/sqrt.cu
rename to cpp/tests/linalg/sqrt.cu
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/tests/linalg/strided_reduction.cu
similarity index 100%
rename from cpp/test/linalg/strided_reduction.cu
rename to cpp/tests/linalg/strided_reduction.cu
diff --git a/cpp/test/linalg/subtract.cu b/cpp/tests/linalg/subtract.cu
similarity index 100%
rename from cpp/test/linalg/subtract.cu
rename to cpp/tests/linalg/subtract.cu
diff --git a/cpp/test/linalg/svd.cu b/cpp/tests/linalg/svd.cu
similarity index 100%
rename from cpp/test/linalg/svd.cu
rename to cpp/tests/linalg/svd.cu
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/tests/linalg/ternary_op.cu
similarity index 100%
rename from cpp/test/linalg/ternary_op.cu
rename to cpp/tests/linalg/ternary_op.cu
diff --git a/cpp/test/linalg/transpose.cu b/cpp/tests/linalg/transpose.cu
similarity index 100%
rename from cpp/test/linalg/transpose.cu
rename to cpp/tests/linalg/transpose.cu
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/tests/linalg/unary_op.cu
similarity index 100%
rename from cpp/test/linalg/unary_op.cu
rename to cpp/tests/linalg/unary_op.cu
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/tests/linalg/unary_op.cuh
similarity index 100%
rename from cpp/test/linalg/unary_op.cuh
rename to cpp/tests/linalg/unary_op.cuh
diff --git a/cpp/test/matrix/argmax.cu b/cpp/tests/matrix/argmax.cu
similarity index 100%
rename from cpp/test/matrix/argmax.cu
rename to cpp/tests/matrix/argmax.cu
diff --git a/cpp/test/matrix/argmin.cu b/cpp/tests/matrix/argmin.cu
similarity index 100%
rename from cpp/test/matrix/argmin.cu
rename to cpp/tests/matrix/argmin.cu
diff --git a/cpp/test/matrix/columnSort.cu b/cpp/tests/matrix/columnSort.cu
similarity index 100%
rename from cpp/test/matrix/columnSort.cu
rename to cpp/tests/matrix/columnSort.cu
diff --git a/cpp/test/matrix/diagonal.cu b/cpp/tests/matrix/diagonal.cu
similarity index 100%
rename from cpp/test/matrix/diagonal.cu
rename to cpp/tests/matrix/diagonal.cu
diff --git a/cpp/test/matrix/eye.cu b/cpp/tests/matrix/eye.cu
similarity index 100%
rename from cpp/test/matrix/eye.cu
rename to cpp/tests/matrix/eye.cu
diff --git a/cpp/test/matrix/gather.cu b/cpp/tests/matrix/gather.cu
similarity index 100%
rename from cpp/test/matrix/gather.cu
rename to cpp/tests/matrix/gather.cu
diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/tests/matrix/linewise_op.cu
similarity index 100%
rename from cpp/test/matrix/linewise_op.cu
rename to cpp/tests/matrix/linewise_op.cu
diff --git a/cpp/test/matrix/math.cu b/cpp/tests/matrix/math.cu
similarity index 100%
rename from cpp/test/matrix/math.cu
rename to cpp/tests/matrix/math.cu
diff --git a/cpp/test/matrix/matrix.cu b/cpp/tests/matrix/matrix.cu
similarity index 100%
rename from cpp/test/matrix/matrix.cu
rename to cpp/tests/matrix/matrix.cu
diff --git a/cpp/test/matrix/norm.cu b/cpp/tests/matrix/norm.cu
similarity index 100%
rename from cpp/test/matrix/norm.cu
rename to cpp/tests/matrix/norm.cu
diff --git a/cpp/test/matrix/reverse.cu b/cpp/tests/matrix/reverse.cu
similarity index 100%
rename from cpp/test/matrix/reverse.cu
rename to cpp/tests/matrix/reverse.cu
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/tests/matrix/sample_rows.cu
similarity index 100%
rename from cpp/test/matrix/sample_rows.cu
rename to cpp/tests/matrix/sample_rows.cu
diff --git a/cpp/test/matrix/scatter.cu b/cpp/tests/matrix/scatter.cu
similarity index 100%
rename from cpp/test/matrix/scatter.cu
rename to cpp/tests/matrix/scatter.cu
diff --git a/cpp/test/matrix/select_k.cu b/cpp/tests/matrix/select_k.cu
similarity index 100%
rename from cpp/test/matrix/select_k.cu
rename to cpp/tests/matrix/select_k.cu
diff --git a/cpp/test/matrix/select_k.cuh b/cpp/tests/matrix/select_k.cuh
similarity index 100%
rename from cpp/test/matrix/select_k.cuh
rename to cpp/tests/matrix/select_k.cuh
diff --git a/cpp/test/matrix/select_large_k.cu b/cpp/tests/matrix/select_large_k.cu
similarity index 100%
rename from cpp/test/matrix/select_large_k.cu
rename to cpp/tests/matrix/select_large_k.cu
diff --git a/cpp/test/matrix/slice.cu b/cpp/tests/matrix/slice.cu
similarity index 100%
rename from cpp/test/matrix/slice.cu
rename to cpp/tests/matrix/slice.cu
diff --git a/cpp/test/matrix/triangular.cu b/cpp/tests/matrix/triangular.cu
similarity index 100%
rename from cpp/test/matrix/triangular.cu
rename to cpp/tests/matrix/triangular.cu
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/tests/mr/device/buffer.cpp
similarity index 100%
rename from cpp/test/mr/device/buffer.cpp
rename to cpp/tests/mr/device/buffer.cpp
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/tests/mr/host/buffer.cpp
similarity index 100%
rename from cpp/test/mr/host/buffer.cpp
rename to cpp/tests/mr/host/buffer.cpp
diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/tests/neighbors/ball_cover.cu
similarity index 100%
rename from cpp/test/neighbors/ball_cover.cu
rename to cpp/tests/neighbors/ball_cover.cu
diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/tests/neighbors/epsilon_neighborhood.cu
similarity index 100%
rename from cpp/test/neighbors/epsilon_neighborhood.cu
rename to cpp/tests/neighbors/epsilon_neighborhood.cu
diff --git a/cpp/test/neighbors/haversine.cu b/cpp/tests/neighbors/haversine.cu
similarity index 100%
rename from cpp/test/neighbors/haversine.cu
rename to cpp/tests/neighbors/haversine.cu
diff --git a/cpp/test/neighbors/knn_utils.cuh b/cpp/tests/neighbors/knn_utils.cuh
similarity index 100%
rename from cpp/test/neighbors/knn_utils.cuh
rename to cpp/tests/neighbors/knn_utils.cuh
diff --git a/cpp/test/neighbors/spatial_data.h b/cpp/tests/neighbors/spatial_data.h
similarity index 100%
rename from cpp/test/neighbors/spatial_data.h
rename to cpp/tests/neighbors/spatial_data.h
diff --git a/cpp/test/random/excess_sampling.cu b/cpp/tests/random/excess_sampling.cu
similarity index 100%
rename from cpp/test/random/excess_sampling.cu
rename to cpp/tests/random/excess_sampling.cu
diff --git a/cpp/test/random/make_blobs.cu b/cpp/tests/random/make_blobs.cu
similarity index 100%
rename from cpp/test/random/make_blobs.cu
rename to cpp/tests/random/make_blobs.cu
diff --git a/cpp/test/random/make_regression.cu b/cpp/tests/random/make_regression.cu
similarity index 100%
rename from cpp/test/random/make_regression.cu
rename to cpp/tests/random/make_regression.cu
diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/tests/random/multi_variable_gaussian.cu
similarity index 100%
rename from cpp/test/random/multi_variable_gaussian.cu
rename to cpp/tests/random/multi_variable_gaussian.cu
diff --git a/cpp/test/random/permute.cu b/cpp/tests/random/permute.cu
similarity index 100%
rename from cpp/test/random/permute.cu
rename to cpp/tests/random/permute.cu
diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/tests/random/rmat_rectangular_generator.cu
similarity index 100%
rename from cpp/test/random/rmat_rectangular_generator.cu
rename to cpp/tests/random/rmat_rectangular_generator.cu
diff --git a/cpp/test/random/rng.cu b/cpp/tests/random/rng.cu
similarity index 100%
rename from cpp/test/random/rng.cu
rename to cpp/tests/random/rng.cu
diff --git a/cpp/test/random/rng_discrete.cu b/cpp/tests/random/rng_discrete.cu
similarity index 100%
rename from cpp/test/random/rng_discrete.cu
rename to cpp/tests/random/rng_discrete.cu
diff --git a/cpp/test/random/rng_int.cu b/cpp/tests/random/rng_int.cu
similarity index 100%
rename from cpp/test/random/rng_int.cu
rename to cpp/tests/random/rng_int.cu
diff --git a/cpp/test/random/rng_pcg_host_api.cu b/cpp/tests/random/rng_pcg_host_api.cu
similarity index 100%
rename from cpp/test/random/rng_pcg_host_api.cu
rename to cpp/tests/random/rng_pcg_host_api.cu
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/tests/random/sample_without_replacement.cu
similarity index 100%
rename from cpp/test/random/sample_without_replacement.cu
rename to cpp/tests/random/sample_without_replacement.cu
diff --git a/cpp/test/sparse/add.cu b/cpp/tests/sparse/add.cu
similarity index 100%
rename from cpp/test/sparse/add.cu
rename to cpp/tests/sparse/add.cu
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/tests/sparse/convert_coo.cu
similarity index 100%
rename from cpp/test/sparse/convert_coo.cu
rename to cpp/tests/sparse/convert_coo.cu
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/tests/sparse/convert_csr.cu
similarity index 100%
rename from cpp/test/sparse/convert_csr.cu
rename to cpp/tests/sparse/convert_csr.cu
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/tests/sparse/csr_row_slice.cu
similarity index 100%
rename from cpp/test/sparse/csr_row_slice.cu
rename to cpp/tests/sparse/csr_row_slice.cu
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/tests/sparse/csr_to_dense.cu
similarity index 100%
rename from cpp/test/sparse/csr_to_dense.cu
rename to cpp/tests/sparse/csr_to_dense.cu
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/tests/sparse/csr_transpose.cu
similarity index 100%
rename from cpp/test/sparse/csr_transpose.cu
rename to cpp/tests/sparse/csr_transpose.cu
diff --git a/cpp/test/sparse/degree.cu b/cpp/tests/sparse/degree.cu
similarity index 100%
rename from cpp/test/sparse/degree.cu
rename to cpp/tests/sparse/degree.cu
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/tests/sparse/dist_coo_spmv.cu
similarity index 100%
rename from cpp/test/sparse/dist_coo_spmv.cu
rename to cpp/tests/sparse/dist_coo_spmv.cu
diff --git a/cpp/test/sparse/distance.cu b/cpp/tests/sparse/distance.cu
similarity index 100%
rename from cpp/test/sparse/distance.cu
rename to cpp/tests/sparse/distance.cu
diff --git a/cpp/test/sparse/filter.cu b/cpp/tests/sparse/filter.cu
similarity index 100%
rename from cpp/test/sparse/filter.cu
rename to cpp/tests/sparse/filter.cu
diff --git a/cpp/test/sparse/masked_matmul.cu b/cpp/tests/sparse/masked_matmul.cu
similarity index 100%
rename from cpp/test/sparse/masked_matmul.cu
rename to cpp/tests/sparse/masked_matmul.cu
diff --git a/cpp/test/sparse/mst.cu b/cpp/tests/sparse/mst.cu
similarity index 100%
rename from cpp/test/sparse/mst.cu
rename to cpp/tests/sparse/mst.cu
diff --git a/cpp/test/sparse/norm.cu b/cpp/tests/sparse/norm.cu
similarity index 100%
rename from cpp/test/sparse/norm.cu
rename to cpp/tests/sparse/norm.cu
diff --git a/cpp/test/sparse/normalize.cu b/cpp/tests/sparse/normalize.cu
similarity index 100%
rename from cpp/test/sparse/normalize.cu
rename to cpp/tests/sparse/normalize.cu
diff --git a/cpp/test/sparse/reduce.cu b/cpp/tests/sparse/reduce.cu
similarity index 100%
rename from cpp/test/sparse/reduce.cu
rename to cpp/tests/sparse/reduce.cu
diff --git a/cpp/test/sparse/row_op.cu b/cpp/tests/sparse/row_op.cu
similarity index 100%
rename from cpp/test/sparse/row_op.cu
rename to cpp/tests/sparse/row_op.cu
diff --git a/cpp/test/sparse/sddmm.cu b/cpp/tests/sparse/sddmm.cu
similarity index 100%
rename from cpp/test/sparse/sddmm.cu
rename to cpp/tests/sparse/sddmm.cu
diff --git a/cpp/test/sparse/select_k_csr.cu b/cpp/tests/sparse/select_k_csr.cu
similarity index 100%
rename from cpp/test/sparse/select_k_csr.cu
rename to cpp/tests/sparse/select_k_csr.cu
diff --git a/cpp/test/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
similarity index 100%
rename from cpp/test/sparse/solver/lanczos.cu
rename to cpp/tests/sparse/solver/lanczos.cu
diff --git a/cpp/test/sparse/sort.cu b/cpp/tests/sparse/sort.cu
similarity index 100%
rename from cpp/test/sparse/sort.cu
rename to cpp/tests/sparse/sort.cu
diff --git a/cpp/test/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
similarity index 100%
rename from cpp/test/sparse/spectral_matrix.cu
rename to cpp/tests/sparse/spectral_matrix.cu
diff --git a/cpp/test/sparse/spgemmi.cu b/cpp/tests/sparse/spgemmi.cu
similarity index 100%
rename from cpp/test/sparse/spgemmi.cu
rename to cpp/tests/sparse/spgemmi.cu
diff --git a/cpp/test/sparse/spmm.cu b/cpp/tests/sparse/spmm.cu
similarity index 100%
rename from cpp/test/sparse/spmm.cu
rename to cpp/tests/sparse/spmm.cu
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/tests/sparse/symmetrize.cu
similarity index 100%
rename from cpp/test/sparse/symmetrize.cu
rename to cpp/tests/sparse/symmetrize.cu
diff --git a/cpp/test/stats/accuracy.cu b/cpp/tests/stats/accuracy.cu
similarity index 100%
rename from cpp/test/stats/accuracy.cu
rename to cpp/tests/stats/accuracy.cu
diff --git a/cpp/test/stats/adjusted_rand_index.cu b/cpp/tests/stats/adjusted_rand_index.cu
similarity index 100%
rename from cpp/test/stats/adjusted_rand_index.cu
rename to cpp/tests/stats/adjusted_rand_index.cu
diff --git a/cpp/test/stats/completeness_score.cu b/cpp/tests/stats/completeness_score.cu
similarity index 100%
rename from cpp/test/stats/completeness_score.cu
rename to cpp/tests/stats/completeness_score.cu
diff --git a/cpp/test/stats/contingencyMatrix.cu b/cpp/tests/stats/contingencyMatrix.cu
similarity index 100%
rename from cpp/test/stats/contingencyMatrix.cu
rename to cpp/tests/stats/contingencyMatrix.cu
diff --git a/cpp/test/stats/cov.cu b/cpp/tests/stats/cov.cu
similarity index 100%
rename from cpp/test/stats/cov.cu
rename to cpp/tests/stats/cov.cu
diff --git a/cpp/test/stats/dispersion.cu b/cpp/tests/stats/dispersion.cu
similarity index 100%
rename from cpp/test/stats/dispersion.cu
rename to cpp/tests/stats/dispersion.cu
diff --git a/cpp/test/stats/entropy.cu b/cpp/tests/stats/entropy.cu
similarity index 100%
rename from cpp/test/stats/entropy.cu
rename to cpp/tests/stats/entropy.cu
diff --git a/cpp/test/stats/histogram.cu b/cpp/tests/stats/histogram.cu
similarity index 100%
rename from cpp/test/stats/histogram.cu
rename to cpp/tests/stats/histogram.cu
diff --git a/cpp/test/stats/homogeneity_score.cu b/cpp/tests/stats/homogeneity_score.cu
similarity index 100%
rename from cpp/test/stats/homogeneity_score.cu
rename to cpp/tests/stats/homogeneity_score.cu
diff --git a/cpp/test/stats/information_criterion.cu b/cpp/tests/stats/information_criterion.cu
similarity index 100%
rename from cpp/test/stats/information_criterion.cu
rename to cpp/tests/stats/information_criterion.cu
diff --git a/cpp/test/stats/kl_divergence.cu b/cpp/tests/stats/kl_divergence.cu
similarity index 100%
rename from cpp/test/stats/kl_divergence.cu
rename to cpp/tests/stats/kl_divergence.cu
diff --git a/cpp/test/stats/mean.cu b/cpp/tests/stats/mean.cu
similarity index 100%
rename from cpp/test/stats/mean.cu
rename to cpp/tests/stats/mean.cu
diff --git a/cpp/test/stats/mean_center.cu b/cpp/tests/stats/mean_center.cu
similarity index 100%
rename from cpp/test/stats/mean_center.cu
rename to cpp/tests/stats/mean_center.cu
diff --git a/cpp/test/stats/meanvar.cu b/cpp/tests/stats/meanvar.cu
similarity index 100%
rename from cpp/test/stats/meanvar.cu
rename to cpp/tests/stats/meanvar.cu
diff --git a/cpp/test/stats/minmax.cu b/cpp/tests/stats/minmax.cu
similarity index 100%
rename from cpp/test/stats/minmax.cu
rename to cpp/tests/stats/minmax.cu
diff --git a/cpp/test/stats/mutual_info_score.cu b/cpp/tests/stats/mutual_info_score.cu
similarity index 100%
rename from cpp/test/stats/mutual_info_score.cu
rename to cpp/tests/stats/mutual_info_score.cu
diff --git a/cpp/test/stats/r2_score.cu b/cpp/tests/stats/r2_score.cu
similarity index 100%
rename from cpp/test/stats/r2_score.cu
rename to cpp/tests/stats/r2_score.cu
diff --git a/cpp/test/stats/rand_index.cu b/cpp/tests/stats/rand_index.cu
similarity index 100%
rename from cpp/test/stats/rand_index.cu
rename to cpp/tests/stats/rand_index.cu
diff --git a/cpp/test/stats/regression_metrics.cu b/cpp/tests/stats/regression_metrics.cu
similarity index 100%
rename from cpp/test/stats/regression_metrics.cu
rename to cpp/tests/stats/regression_metrics.cu
diff --git a/cpp/test/stats/stddev.cu b/cpp/tests/stats/stddev.cu
similarity index 100%
rename from cpp/test/stats/stddev.cu
rename to cpp/tests/stats/stddev.cu
diff --git a/cpp/test/stats/sum.cu b/cpp/tests/stats/sum.cu
similarity index 100%
rename from cpp/test/stats/sum.cu
rename to cpp/tests/stats/sum.cu
diff --git a/cpp/test/stats/v_measure.cu b/cpp/tests/stats/v_measure.cu
similarity index 100%
rename from cpp/test/stats/v_measure.cu
rename to cpp/tests/stats/v_measure.cu
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/tests/stats/weighted_mean.cu
similarity index 100%
rename from cpp/test/stats/weighted_mean.cu
rename to cpp/tests/stats/weighted_mean.cu
diff --git a/cpp/test/test.cpp b/cpp/tests/test.cpp
similarity index 100%
rename from cpp/test/test.cpp
rename to cpp/tests/test.cpp
diff --git a/cpp/test/test_utils.cuh b/cpp/tests/test_utils.cuh
similarity index 100%
rename from cpp/test/test_utils.cuh
rename to cpp/tests/test_utils.cuh
diff --git a/cpp/test/test_utils.h b/cpp/tests/test_utils.h
similarity index 100%
rename from cpp/test/test_utils.h
rename to cpp/tests/test_utils.h
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/tests/util/bitonic_sort.cu
similarity index 100%
rename from cpp/test/util/bitonic_sort.cu
rename to cpp/tests/util/bitonic_sort.cu
diff --git a/cpp/test/util/cudart_utils.cpp b/cpp/tests/util/cudart_utils.cpp
similarity index 100%
rename from cpp/test/util/cudart_utils.cpp
rename to cpp/tests/util/cudart_utils.cpp
diff --git a/cpp/test/util/device_atomics.cu b/cpp/tests/util/device_atomics.cu
similarity index 100%
rename from cpp/test/util/device_atomics.cu
rename to cpp/tests/util/device_atomics.cu
diff --git a/cpp/test/util/integer_utils.cpp b/cpp/tests/util/integer_utils.cpp
similarity index 100%
rename from cpp/test/util/integer_utils.cpp
rename to cpp/tests/util/integer_utils.cpp
diff --git a/cpp/test/util/integer_utils.cu b/cpp/tests/util/integer_utils.cu
similarity index 100%
rename from cpp/test/util/integer_utils.cu
rename to cpp/tests/util/integer_utils.cu
diff --git a/cpp/test/util/memory_type_dispatcher.cu b/cpp/tests/util/memory_type_dispatcher.cu
similarity index 100%
rename from cpp/test/util/memory_type_dispatcher.cu
rename to cpp/tests/util/memory_type_dispatcher.cu
diff --git a/cpp/test/util/popc.cu b/cpp/tests/util/popc.cu
similarity index 100%
rename from cpp/test/util/popc.cu
rename to cpp/tests/util/popc.cu
diff --git a/cpp/test/util/pow2_utils.cu b/cpp/tests/util/pow2_utils.cu
similarity index 100%
rename from cpp/test/util/pow2_utils.cu
rename to cpp/tests/util/pow2_utils.cu
diff --git a/cpp/test/util/reduction.cu b/cpp/tests/util/reduction.cu
similarity index 100%
rename from cpp/test/util/reduction.cu
rename to cpp/tests/util/reduction.cu
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 6240b2638b..1a2626f2b2 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -211,7 +211,7 @@ This will bring up an interactive prompt to select which spelling fixes to apply
 
 Manually, run the following to bulk-fix include style issues:
 ```bash
-python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list of folders which you want to fix]
+python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/tests ... list of folders which you want to fix]
 ```
 
 ### Copyright header
@@ -298,9 +298,9 @@ RAFT is a heavily templated library. Several core functions are expensive to com
 
 **Macros.** We define the macros `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY`. The `RAFT_COMPILED` macro is defined by `CMake` when compiling code that (1) is part of `libraft.so` or (2) is linked with `libraft.so`. It indicates that a precompiled `libraft.so` is present at runtime.
 
-The `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro is defined by `CMake` during compilation of `libraft.so` itself. When defined, it indicates that implicit instantiations of expensive function templates are forbidden (they result in a compiler error). In the RAFT project, we additionally define this macro during compilation of the tests and benchmarks. 
+The `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro is defined by `CMake` during compilation of `libraft.so` itself. When defined, it indicates that implicit instantiations of expensive function templates are forbidden (they result in a compiler error). In the RAFT project, we additionally define this macro during compilation of the tests and benchmarks.
 
-Below, we summarize which combinations of `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` are used in practice and what the effect of the combination is. 
+Below, we summarize which combinations of `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` are used in practice and what the effect of the combination is.
 
 | RAFT_COMPILED | RAFT_EXPLICIT_INSTANTIATE_ONLY | Which targets                                                                                        |
 |---------------|--------------------------------|------------------------------------------------------------------------------------------------------|
@@ -349,7 +349,7 @@ The file `expensive-ext.cuh` contains the following:
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 namespace raft {
-// (1) define templates to raise an error in case of accidental instantiation 
+// (1) define templates to raise an error in case of accidental instantiation
 template <typename T> void expensive(T arg) RAFT_EXPLICIT;
 } // namespace raft
 #endif //RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -371,7 +371,7 @@ template void raft::expensive<int>(int);
 template void raft::expensive<float>(float);
 ```
 
-**Design considerations**: 
+**Design considerations**:
 
 1. In the `-ext.cuh` header, do not include implementation headers. Only include function parameter types and types that are used to instantiate the templates. If a primitive takes custom parameter types, define them in a separate header called `<primitive_name>_types.hpp`. (see [Common Design Considerations](https://github.com/rapidsai/raft/blob/7b065aff81a0b1976e2a9e2f3de6690361a1111b/docs/source/developer_guide.md#common-design-considerations)).
 
@@ -381,7 +381,7 @@ template void raft::expensive<float>(float);
 
 4. If a header file defines multiple expensive templates, it can be that one of them is not instantiated. In this case, **do define** the template with `RAFT_EXPLICIT` in the `-ext` header. This way, when the template is instantiated, the developer gets a helpful error message instead of a confusing "function not found".
 
-This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation. 
+This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation.
 
 ## Testing
 
diff --git a/pyproject.toml b/pyproject.toml
index 2f23debfbe..460c0312a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ ignore_missing_imports = true
 # they are imported by a checked file.
 follow_imports = "skip"
 exclude = [
-    "pylibraft/pylibraft/test",
+    "pylibraft/pylibraft/tests",
  ]
 
 [tool.codespell]
diff --git a/python/pylibraft/pylibraft/test/__init__py b/python/pylibraft/pylibraft/tests/__init__py
similarity index 100%
rename from python/pylibraft/pylibraft/test/__init__py
rename to python/pylibraft/pylibraft/tests/__init__py
diff --git a/python/pylibraft/pylibraft/test/pytest.ini b/python/pylibraft/pylibraft/tests/pytest.ini
similarity index 100%
rename from python/pylibraft/pylibraft/test/pytest.ini
rename to python/pylibraft/pylibraft/tests/pytest.ini
diff --git a/python/pylibraft/pylibraft/test/test_cai_wrapper.py b/python/pylibraft/pylibraft/tests/test_cai_wrapper.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_cai_wrapper.py
rename to python/pylibraft/pylibraft/tests/test_cai_wrapper.py
diff --git a/python/pylibraft/pylibraft/test/test_config.py b/python/pylibraft/pylibraft/tests/test_config.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_config.py
rename to python/pylibraft/pylibraft/tests/test_config.py
diff --git a/python/pylibraft/pylibraft/test/test_device_ndarray.py b/python/pylibraft/pylibraft/tests/test_device_ndarray.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_device_ndarray.py
rename to python/pylibraft/pylibraft/tests/test_device_ndarray.py
diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/tests/test_doctests.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_doctests.py
rename to python/pylibraft/pylibraft/tests/test_doctests.py
diff --git a/python/pylibraft/pylibraft/test/test_handle.py b/python/pylibraft/pylibraft/tests/test_handle.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_handle.py
rename to python/pylibraft/pylibraft/tests/test_handle.py
diff --git a/python/pylibraft/pylibraft/test/test_mdspan_serializer.py b/python/pylibraft/pylibraft/tests/test_mdspan_serializer.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_mdspan_serializer.py
rename to python/pylibraft/pylibraft/tests/test_mdspan_serializer.py
diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/tests/test_random.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_random.py
rename to python/pylibraft/pylibraft/tests/test_random.py
diff --git a/python/pylibraft/pylibraft/test/test_sparse.py b/python/pylibraft/pylibraft/tests/test_sparse.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_sparse.py
rename to python/pylibraft/pylibraft/tests/test_sparse.py
diff --git a/python/pylibraft/pylibraft/test/test_version.py b/python/pylibraft/pylibraft/tests/test_version.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_version.py
rename to python/pylibraft/pylibraft/tests/test_version.py
diff --git a/python/pylibraft/pylibraft/test/test_z_interruptible.py b/python/pylibraft/pylibraft/tests/test_z_interruptible.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_z_interruptible.py
rename to python/pylibraft/pylibraft/tests/test_z_interruptible.py
diff --git a/python/raft-dask/raft_dask/test/conftest.py b/python/raft-dask/raft_dask/tests/conftest.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/conftest.py
rename to python/raft-dask/raft_dask/tests/conftest.py
diff --git a/python/raft-dask/raft_dask/test/pytest.ini b/python/raft-dask/raft_dask/tests/pytest.ini
similarity index 100%
rename from python/raft-dask/raft_dask/test/pytest.ini
rename to python/raft-dask/raft_dask/tests/pytest.ini
diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_comms.py
rename to python/raft-dask/raft_dask/tests/test_comms.py
diff --git a/python/raft-dask/raft_dask/test/test_raft.py b/python/raft-dask/raft_dask/tests/test_raft.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_raft.py
rename to python/raft-dask/raft_dask/tests/test_raft.py
diff --git a/python/raft-dask/raft_dask/test/test_version.py b/python/raft-dask/raft_dask/tests/test_version.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_version.py
rename to python/raft-dask/raft_dask/tests/test_version.py

From 85fd74dd32cd10c9ff6bfa73077b7e693a5e22dd Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 24 Jan 2025 03:38:29 -0500
Subject: [PATCH 29/37] Add cuda 12.8 support (#2551)

CUDA 12.8 introduces sm_120, which requires a reduced number of threads per SM.

We also need to pass `-static-global-template-stub=false` when building with 12.8, as we violate CUDA ODR kernel rules.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Corey J. Nolet (https://github.com/cjnolet)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/raft/pull/2551
---
 cpp/cmake/modules/ConfigureCUDA.cmake            | 8 +++++++-
 cpp/include/raft/neighbors/detail/nn_descent.cuh | 5 +++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 25b9b0ddf8..fbf4428650 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -29,6 +29,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
     list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
   endif()
+
+  # Allow invalid CUDA kernels in the short term
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+    list(APPEND RAFT_CUDA_FLAGS -static-global-template-stub=false)
+  endif()
+
 endif()
 
 if(CUDA_LOG_COMPILE_TIME)
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
index 02610f9afb..64e4a3ea7a 100644
--- a/cpp/include/raft/neighbors/detail/nn_descent.cuh
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -709,7 +709,8 @@ template <typename Index_t,
           typename epilogue_op = DistEpilogue<Index_t, DistData_t>>
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890)
+#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) || \
+  (__CUDA_ARCH__) == 1200
 __launch_bounds__(BLOCK_SIZE)
 #else
 __launch_bounds__(BLOCK_SIZE, 4)

From 14c92cc37561944fbc76b7d511122b8d2bca627e Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 24 Jan 2025 08:48:58 -0600
Subject: [PATCH 30/37] Update pip devcontainers to UCX 1.18 (#2550)

Contributes to https://github.com/rapidsai/build-planning/issues/138

Updates to using UCX 1.18 in pip devcontainers here.

Also updates `rapids-dependency-file-generator` and `pre-commit-hooks` hooks to their latest versions.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - https://github.com/jakirkham
  - Gil Forsyth (https://github.com/gforsyth)

URL: https://github.com/rapidsai/raft/pull/2550
---
 .devcontainer/cuda11.8-pip/devcontainer.json | 2 +-
 .devcontainer/cuda12.5-pip/devcontainer.json | 2 +-
 .pre-commit-config.yaml                      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index c691ed6007..94b0909f6c 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,7 +5,7 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index bc43900ef3..2bcfa8733f 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,7 +5,7 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ca1efc3abd..4e0cf53c4d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -98,7 +98,7 @@ repos:
                     ^CHANGELOG[.]md$|
                     ^cpp/cmake/patches/cutlass/build-export[.]patch$
       - repo: https://github.com/pre-commit/pre-commit-hooks
-        rev: v4.5.0
+        rev: v5.0.0
         hooks:
               - id: check-json
       - repo: https://github.com/rapidsai/pre-commit-hooks
@@ -118,7 +118,7 @@ repos:
                   docs/source/sphinxext/github_link[.]py|
           - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.16.0
+        rev: v1.17.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]

From fc4c49010499f2d6954c995495ee14afbb7c4a90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Malte=20F=C3=B6rster?=
 <97973773+mfoerste4@users.noreply.github.com>
Date: Fri, 24 Jan 2025 19:17:11 +0100
Subject: [PATCH 31/37] Fix bit order of RMAT Rectangular Generator to match
 expectation (#2542)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the decimal indices of rows/columns of the adjacency matrix did not align with the node-ids created by the algorithm. This PR fixes the bits set for each decision during the computation as described by the docstring.

FYI @tfeher

Authors:
  - Malte Förster (https://github.com/mfoerste4)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/raft/pull/2542
---
 .../detail/rmat_rectangular_generator.cuh     |  4 +-
 .../random/rmat_rectangular_generator.cuh     | 27 +++--
 .../random/rmat_rectangular_generator.cu      | 98 ++++++++++++++++++-
 3 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
index 24207ba6db..12c01fc5d7 100644
--- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -54,8 +54,8 @@ DI void gen_and_update_bits(IdxT& src_id,
   } else {
     src_bit = dst_bit = true;
   }
-  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (r_scale - curr_depth - 1)); }
-  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (c_scale - curr_depth - 1)); }
+  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (curr_depth)); }
+  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (curr_depth)); }
 }
 
 template <typename IdxT>
diff --git a/cpp/include/raft/random/rmat_rectangular_generator.cuh b/cpp/include/raft/random/rmat_rectangular_generator.cuh
index 5598b25c8e..cdd89f40dd 100644
--- a/cpp/include/raft/random/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/rmat_rectangular_generator.cuh
@@ -30,8 +30,18 @@ namespace raft::random {
 /**
  * @brief Generate a bipartite RMAT graph for a rectangular adjacency matrix.
  *
- * This is the most general of several overloads of `rmat_rectangular_gen`
- * in this file, and thus has the most detailed documentation.
+ * This function generates a random graph represented by a (sparse) adjacency matrix. As described
+ * in [1], to generate connections, we recursively subdivide the adjacency matrix into four
+ * equal-sized partitions, and distribute edges within these partitions with unequal
+ * probabilities. The probabilities are described by numbers [a, b, c, d]. We choose the upper left
+ * partition with probability `a`. The chosen partition is again subdivided into four smaller
+ * partitions, and the procedure is repeated until we reach a single element (1 x 1 partition).
+ *
+ * We can prescribe different probability distributions at each iteration. The `theta` array stores
+ * the probability values for each level.
+ *
+ * [1] "R-MAT: A Recursive Model for Graph Mining" Deepayan Chakrabarti, Yiping Zhan, Christos
+ * Faloutsos (2004) https://doi.org/10.1137/1.9781611972740.43
  *
  * @tparam IdxT  Type of each node index
  * @tparam ProbT Data type used for probability distributions (either fp32 or fp64)
@@ -49,11 +59,14 @@ namespace raft::random {
  * @param[out] out_dst Destination node id's [on device].  `out_src` and `out_dst`
  *                     together form the struct-of-arrays representation of the same
  *                     output data as `out`.
- * @param[in]  theta   distribution of each quadrant at each level of resolution.
- *                     Since these are probabilities, each of the 2x2 matrices for
- *                     each level of the RMAT must sum to one. [on device]
- *                     [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed
- *                     that each of the group of 2 x 2 numbers all sum up to 1.
+ * @param[in]  theta   array [on device] with the distribution of each quadrant at each level of
+ *                     resolution. theta = [a0, b0, c0, d0, a1, b1, c1, d1, ...], where
+ *                     [a0, b0, c0, d0] defines the probability at the finest level (2x2).
+ *                     The last four elements in the array describe the probability in the
+ *                     coarsest level (where matrix size = [2^r_scale, 2^c_scale]).
+ *                     Since these are probabilities, the four [a_i, b_i, c_i, d_i] values for
+ *                     each level of the RMAT must sum to one.
+ *                     [dim = max(r_scale, c_scale) x 2 x 2].
  * @param[in]  r_scale 2^r_scale represents the number of source nodes
  * @param[in]  c_scale 2^c_scale represents the number of destination nodes
  *
diff --git a/cpp/tests/random/rmat_rectangular_generator.cu b/cpp/tests/random/rmat_rectangular_generator.cu
index 8d668f7a8a..10c00051b6 100644
--- a/cpp/tests/random/rmat_rectangular_generator.cu
+++ b/cpp/tests/random/rmat_rectangular_generator.cu
@@ -155,10 +155,10 @@ RAFT_KERNEL compute_hist(
   size_t idx = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
   if (idx + 1 < len) {
     auto src = out[idx], dst = out[idx + 1];
-    for (size_t j = 0; j < max_scale; ++j) {
-      bool src_bit = j < r_scale ? src & (1 << (r_scale - j - 1)) : 0;
-      bool dst_bit = j < c_scale ? dst & (1 << (c_scale - j - 1)) : 0;
-      auto idx     = j * 4 + src_bit * 2 + dst_bit;
+    for (size_t bit_pos = 0; bit_pos < max_scale; ++bit_pos) {
+      bool src_bit = bit_pos < r_scale ? src & (1 << bit_pos) : 0;
+      bool dst_bit = bit_pos < c_scale ? dst & (1 << bit_pos) : 0;
+      auto idx     = bit_pos * 4 + src_bit * 2 + dst_bit;
       atomicAdd(hist + idx, 1);
     }
   }
@@ -393,11 +393,101 @@ const std::vector<RmatInputs> inputs = {
   {18, 16, 200000, false, 456789ULL, TOLERANCE},
   {18, 16, 200000, true, 456789ULL, TOLERANCE}};
 
+struct RmatForcedOutputs {  // parameters of one RmatGenForceTest case
+  size_t r_scale;    // 2^r_scale source nodes
+  size_t c_scale;    // 2^c_scale destination nodes
+  size_t r_node_id;  // value expected in out[0] after generation
+  size_t c_node_id;  // value expected in out[1] after generation
+};
+
+// Forces theta to probability 1 on one quadrant per level (picked from the bits of the requested
+// node ids) and checks that the generated output is exactly {r_node_id, c_node_id}.
+class RmatGenForceTest : public ::testing::TestWithParam<RmatForcedOutputs> {
+ public:
+  // Initializers are listed in member declaration order, i.e. the order in which they run.
+  RmatGenForceTest()
+    : handle{},
+      stream{resource::get_cuda_stream(handle)},
+      params{::testing::TestWithParam<RmatForcedOutputs>::GetParam()},
+      max_scale(std::max(params.r_scale, params.c_scale)),
+      h_theta{},
+      out{2, stream},
+      out_src{1, stream},
+      out_dst{1, stream},
+      theta{0, stream},
+      state{0, GeneratorType::GenPC}
+  {
+    theta.resize(4 * max_scale, stream);
+    h_theta.resize(theta.size(), 0.f);
+    for (size_t bit_pos = 0; bit_pos < max_scale; ++bit_pos) {
+      // Use a size_t one: shifting an int `1` cannot form a clean mask for bit 31 and above.
+      size_t row_bit = ((params.r_node_id & (size_t{1} << bit_pos)) != 0);
+      size_t col_bit = ((params.c_node_id & (size_t{1} << bit_pos)) != 0);
+      // now force theta for bit -- 2x2 matrix row major
+      h_theta[4 * bit_pos + row_bit * 2 + col_bit] = 1.f;
+    }
+
+    raft::update_device(theta.data(), h_theta.data(), max_scale * 4, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+ protected:
+  // Runs rmat_rectangular_gen with the forced theta; `out` holds a single (src, dst) pair.
+  void SetUp() override
+  {
+    rmat_rectangular_gen(out.data(),
+                         out_src.data(),
+                         out_dst.data(),
+                         theta.data(),
+                         params.r_scale,
+                         params.c_scale,
+                         size_t(1),
+                         stream,
+                         state);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  // Copies `out` back to the host and asserts that it equals {r_node_id, c_node_id}.
+  void validate()
+  {
+    std::vector<size_t> h_out(2, size_t(0));
+    raft::update_host(h_out.data(), out.data(), 2, stream);
+    RAFT_CUDA_TRY(cudaGetLastError());
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+    std::vector<size_t> h_out_expect{params.r_node_id, params.c_node_id};
+    ASSERT_TRUE(hostVecMatch(h_out_expect, h_out, raft::Compare<size_t>()));
+  }
+
+  raft::resources handle;
+  cudaStream_t stream;
+  RmatForcedOutputs params;
+  size_t max_scale;            // max(r_scale, c_scale); theta holds 4 floats per level
+  std::vector<float> h_theta;  // host-side staging copy of theta
+  rmm::device_uvector<size_t> out, out_src, out_dst;
+  rmm::device_uvector<float> theta;
+  RngState state;
+};
+
+const std::vector<RmatForcedOutputs> forcedInputs = {{16, 16, 12425, 1233},
+                                                     {16, 16, 12, 424},
+                                                     {5, 5, 15, 15},   // r_scale == c_scale
+                                                     {5, 6, 15, 15},   // c_scale > r_scale
+                                                     {5, 15, 15, 15},
+                                                     {6, 5, 15, 15},   // r_scale > c_scale
+                                                     {15, 5, 15, 15},
+                                                     {32, 16, 1253163, 60000},  // r_scale of 32
+                                                     {16, 16, 12, 0},     // c_node_id == 0
+                                                     {16, 16, 0, 1255}};  // r_node_id == 0
+
 TEST_P(RmatGenTest, Result) { validate(); }
 INSTANTIATE_TEST_SUITE_P(RmatGenTests, RmatGenTest, ::testing::ValuesIn(inputs));
 
 TEST_P(RmatGenMdspanTest, Result) { validate(); }
 INSTANTIATE_TEST_SUITE_P(RmatGenMdspanTests, RmatGenMdspanTest, ::testing::ValuesIn(inputs));
 
+TEST_P(RmatGenForceTest, Result) { validate(); }
+INSTANTIATE_TEST_SUITE_P(RmatGenForceTests, RmatGenForceTest, ::testing::ValuesIn(forcedInputs));
+
 }  // namespace random
 }  // namespace raft

From ef4a7e1acd5151e60d2489ecef4991605891f008 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 24 Jan 2025 19:44:20 -0600
Subject: [PATCH 32/37] Normalize whitespace (#2547)

This PR applies `pre-commit` hooks to normalize whitespace (trimming trailing whitespace and enforcing consistent end-of-file newlines).

These rules are already applied to most other RAPIDS repos, so this PR aligns with the norm in RAPIDS.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/raft/pull/2547
---
 .pre-commit-config.yaml                                |  7 ++++++-
 build.sh                                               |  1 -
 cpp/cmake/patches/cutlass/build-export.patch           |  5 ++---
 cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh |  2 +-
 cpp/include/raft/cluster/detail/mst.cuh                |  2 +-
 cpp/include/raft/cluster/detail/single_linkage.cuh     |  2 +-
 cpp/include/raft/common/nvtx.hpp                       |  2 +-
 cpp/include/raft/core/coo_matrix.hpp                   |  2 +-
 cpp/include/raft/core/csr_matrix.hpp                   |  2 +-
 cpp/include/raft/core/detail/mdspan_util.cuh           |  2 +-
 cpp/include/raft/core/device_coo_matrix.hpp            |  2 +-
 cpp/include/raft/core/device_csr_matrix.hpp            |  2 +-
 cpp/include/raft/core/device_span.hpp                  |  2 +-
 cpp/include/raft/core/host_coo_matrix.hpp              |  2 +-
 cpp/include/raft/core/host_csr_matrix.hpp              |  2 +-
 cpp/include/raft/core/host_mdarray.hpp                 |  2 +-
 cpp/include/raft/core/host_span.hpp                    |  2 +-
 cpp/include/raft/core/resource/device_id.hpp           |  2 +-
 cpp/include/raft/core/resource/device_properties.hpp   |  2 +-
 cpp/include/raft/core/resource/sub_comms.hpp           |  2 +-
 cpp/include/raft/core/sparse_types.hpp                 |  2 +-
 .../raft/distance/detail/fused_distance_nn/gemm.h      |  2 +-
 .../raft/distance/detail/pairwise_distance_gemm.h      |  2 +-
 cpp/include/raft/distance/fused_distance_nn.cuh        |  2 +-
 cpp/include/raft/label/classlabels.cuh                 |  2 +-
 cpp/include/raft/label/detail/merge_labels.cuh         |  2 +-
 cpp/include/raft/label/merge_labels.cuh                |  2 +-
 cpp/include/raft/linalg/cholesky_r1_update.cuh         |  2 +-
 cpp/include/raft/linalg/coalesced_reduction.cuh        |  2 +-
 cpp/include/raft/linalg/detail/add.cuh                 |  2 +-
 cpp/include/raft/linalg/divide.cuh                     |  2 +-
 cpp/include/raft/linalg/eig.cuh                        |  2 +-
 cpp/include/raft/linalg/eltwise.cuh                    |  2 +-
 cpp/include/raft/linalg/gemv.cuh                       |  2 +-
 cpp/include/raft/linalg/linalg_types.hpp               |  2 +-
 cpp/include/raft/linalg/lstsq.cuh                      |  2 +-
 cpp/include/raft/linalg/map_reduce.cuh                 |  2 +-
 cpp/include/raft/linalg/map_then_reduce.cuh            |  2 +-
 cpp/include/raft/linalg/matrix_vector.cuh              |  2 +-
 cpp/include/raft/linalg/multiply.cuh                   |  2 +-
 cpp/include/raft/linalg/power.cuh                      |  2 +-
 cpp/include/raft/linalg/qr.cuh                         |  2 +-
 cpp/include/raft/linalg/reduce.cuh                     |  2 +-
 cpp/include/raft/linalg/reduce_cols_by_key.cuh         |  2 +-
 cpp/include/raft/linalg/reduce_rows_by_key.cuh         |  2 +-
 cpp/include/raft/linalg/rsvd.cuh                       |  2 +-
 cpp/include/raft/linalg/sqrt.cuh                       |  2 +-
 cpp/include/raft/linalg/strided_reduction.cuh          |  2 +-
 cpp/include/raft/linalg/subtract.cuh                   |  2 +-
 cpp/include/raft/linalg/svd.cuh                        |  2 +-
 cpp/include/raft/matrix/col_wise_sort.cuh              |  2 +-
 cpp/include/raft/matrix/detail/gather_inplace.cuh      |  2 +-
 cpp/include/raft/matrix/detail/scatter_inplace.cuh     |  2 +-
 cpp/include/raft/matrix/math.hpp                       |  2 +-
 cpp/include/raft/matrix/norm.cuh                       |  2 +-
 cpp/include/raft/matrix/reverse.cuh                    |  2 +-
 cpp/include/raft/matrix/scatter.cuh                    |  2 +-
 .../neighbors/detail/cagra/compute_distance_vpq.cuh    |  2 +-
 cpp/include/raft/neighbors/detail/div_utils.hpp        |  2 +-
 cpp/include/raft/neighbors/ivf_flat_codepacker.hpp     |  2 +-
 cpp/include/raft/random/detail/curand_wrappers.hpp     |  2 +-
 cpp/include/raft/random/detail/permute.cuh             |  2 +-
 cpp/include/raft/random/make_blobs.cuh                 |  2 +-
 cpp/include/raft/random/sample_without_replacement.cuh |  2 +-
 cpp/include/raft/solver/linear_assignment.cuh          |  2 +-
 cpp/include/raft/sparse/convert/coo.cuh                |  2 +-
 cpp/include/raft/sparse/convert/dense.cuh              |  2 +-
 cpp/include/raft/sparse/convert/detail/coo.cuh         |  2 +-
 cpp/include/raft/sparse/convert/detail/dense.cuh       |  2 +-
 cpp/include/raft/sparse/detail/cusparse_macros.h       |  2 +-
 cpp/include/raft/sparse/distance/detail/common.hpp     |  2 +-
 .../coo_spmv_strategies/coo_mask_row_iterators.cuh     |  2 +-
 .../detail/coo_spmv_strategies/dense_smem_strategy.cuh |  2 +-
 cpp/include/raft/sparse/distance/distance.cuh          |  2 +-
 cpp/include/raft/sparse/linalg/add.cuh                 |  2 +-
 cpp/include/raft/sparse/linalg/degree.cuh              |  2 +-
 cpp/include/raft/sparse/linalg/detail/norm.cuh         |  2 +-
 cpp/include/raft/sparse/linalg/detail/transpose.h      |  2 +-
 cpp/include/raft/sparse/linalg/norm.cuh                |  2 +-
 cpp/include/raft/sparse/linalg/spectral.cuh            |  2 +-
 cpp/include/raft/sparse/linalg/symmetrize.cuh          |  2 +-
 cpp/include/raft/sparse/linalg/transpose.cuh           |  2 +-
 .../raft/sparse/neighbors/cross_component_nn.cuh       |  2 +-
 cpp/include/raft/sparse/op/filter.cuh                  |  2 +-
 cpp/include/raft/sparse/op/reduce.cuh                  |  2 +-
 cpp/include/raft/sparse/op/row_op.cuh                  |  2 +-
 cpp/include/raft/sparse/op/slice.cuh                   |  2 +-
 cpp/include/raft/sparse/solver/lanczos.cuh             |  2 +-
 cpp/include/raft/spectral/cluster_solvers.cuh          |  2 +-
 .../raft/spectral/cluster_solvers_deprecated.cuh       |  2 +-
 cpp/include/raft/spectral/modularity_maximization.cuh  |  2 +-
 cpp/include/raft/spectral/partition.cuh                |  2 +-
 cpp/include/raft/stats/accuracy.cuh                    |  2 +-
 cpp/include/raft/stats/adjusted_rand_index.cuh         |  2 +-
 cpp/include/raft/stats/completeness_score.cuh          |  2 +-
 cpp/include/raft/stats/contingency_matrix.cuh          |  2 +-
 cpp/include/raft/stats/cov.cuh                         |  2 +-
 cpp/include/raft/stats/detail/mean.cuh                 |  2 +-
 cpp/include/raft/stats/detail/stddev.cuh               |  2 +-
 cpp/include/raft/stats/detail/sum.cuh                  |  2 +-
 cpp/include/raft/stats/detail/weighted_mean.cuh        |  2 +-
 cpp/include/raft/stats/dispersion.cuh                  |  2 +-
 cpp/include/raft/stats/entropy.cuh                     |  2 +-
 cpp/include/raft/stats/homogeneity_score.cuh           |  2 +-
 cpp/include/raft/stats/mean.cuh                        |  2 +-
 cpp/include/raft/stats/mean_center.cuh                 |  2 +-
 cpp/include/raft/stats/minmax.cuh                      |  2 +-
 cpp/include/raft/stats/mutual_info_score.cuh           |  2 +-
 cpp/include/raft/stats/r2_score.cuh                    |  2 +-
 cpp/include/raft/stats/rand_index.cuh                  |  2 +-
 cpp/include/raft/stats/regression_metrics.cuh          |  2 +-
 cpp/include/raft/stats/silhouette_score.cuh            |  2 +-
 cpp/include/raft/stats/stddev.cuh                      |  2 +-
 cpp/include/raft/stats/sum.cuh                         |  2 +-
 cpp/include/raft/stats/trustworthiness_score.cuh       |  2 +-
 cpp/include/raft/stats/v_measure.cuh                   |  2 +-
 cpp/include/raft/stats/weighted_mean.cuh               |  2 +-
 .../raft/thirdparty/mdspan/.github/workflows/cmake.yml | 10 +++++-----
 cpp/include/raft/thirdparty/mdspan/LICENSE             |  8 ++++----
 cpp/include/raft/thirdparty/mdspan/README.md           |  1 -
 .../mdspan/benchmarks/sum/cuda/CMakeLists.txt          |  2 +-
 .../mdspan/benchmarks/sum/openmp/CMakeLists.txt        |  2 +-
 .../mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp     |  1 -
 .../mdspan/benchmarks/sum/sum_submdspan_right.cpp      |  1 -
 .../compilation_tests/ctest_compressed_pair_layout.cpp |  1 -
 .../mdspan/compilation_tests/ctest_extents_ctors.cpp   |  1 -
 .../compilation_tests/ctest_layout_convertible.cpp     |  2 --
 .../compilation_tests/ctest_mdspan_convertible.cpp     |  1 -
 .../compilation_tests/ctest_no_unique_address.cpp      |  2 --
 .../mdspan/compilation_tests/ctest_standard_layout.cpp |  3 ---
 .../compilation_tests/ctest_trivially_copyable.cpp     |  3 ---
 .../examples/tiled_layout/simple_tiled_layout.cpp      |  1 -
 .../experimental/__p0009_bits/aligned_accessor.hpp     |  2 +-
 .../include/experimental/__p0009_bits/extents.hpp      |  2 +-
 .../include/experimental/__p0009_bits/layout_left.hpp  |  1 -
 .../experimental/__p0009_bits/layout_padded.hpp        |  4 ++--
 .../include/experimental/__p0009_bits/layout_right.hpp |  1 -
 .../experimental/__p0009_bits/no_unique_address.hpp    |  4 ++--
 .../include/experimental/__p0009_bits/type_list.hpp    |  1 -
 .../thirdparty/mdspan/include/experimental/mdarray     |  1 -
 .../raft/thirdparty/mdspan/make_single_header.py       |  1 -
 .../raft/thirdparty/mdspan/tests/CMakeLists.txt        |  1 -
 .../mdspan/tests/test_exhaustive_layouts.cpp           |  1 -
 .../thirdparty/mdspan/tests/test_layout_stride.cpp     |  1 -
 .../thirdparty/mdspan/tests/test_mdarray_ctors.cpp     |  2 +-
 .../raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp |  2 +-
 cpp/include/raft/util/detail/popc.cuh                  |  2 +-
 cpp/include/raft/util/input_validation.hpp             |  2 +-
 cpp/include/raft/util/warp_primitives.cuh              |  2 +-
 cpp/scripts/run-clang-compile.py                       |  4 ++--
 cpp/scripts/run-clang-tidy.py                          |  4 ++--
 cpp/scripts/run-cmake-format.sh                        |  2 +-
 cpp/tests/linalg/cholesky_r1.cu                        |  2 +-
 cpp/tests/matrix/argmax.cu                             |  2 +-
 cpp/tests/matrix/argmin.cu                             |  2 +-
 cpp/tests/matrix/diagonal.cu                           |  2 +-
 cpp/tests/matrix/gather.cu                             |  2 +-
 cpp/tests/matrix/scatter.cu                            |  2 +-
 cpp/tests/mr/device/buffer.cpp                         |  2 +-
 cpp/tests/mr/host/buffer.cpp                           |  2 +-
 cpp/tests/neighbors/spatial_data.h                     |  2 +-
 cpp/tests/stats/weighted_mean.cu                       |  2 +-
 cpp/tests/test_utils.cuh                               |  2 +-
 docs/README.md                                         |  2 +-
 docs/source/_static/references.css                     |  2 +-
 docs/source/contributing.md                            |  2 --
 docs/source/cpp_api.rst                                |  2 +-
 docs/source/cpp_api/core.rst                           |  2 +-
 docs/source/cpp_api/core_bitmap.rst                    |  2 +-
 docs/source/cpp_api/core_bitset.rst                    |  2 +-
 docs/source/cpp_api/core_kvp.rst                       |  1 -
 docs/source/cpp_api/core_logger.rst                    |  1 -
 docs/source/cpp_api/core_nvtx.rst                      |  2 --
 docs/source/cpp_api/linalg.rst                         |  4 ++--
 docs/source/cpp_api/linalg_arithmetic.rst              |  1 -
 docs/source/cpp_api/linalg_matrix.rst                  |  1 -
 docs/source/cpp_api/linalg_matrix_vector.rst           |  1 -
 docs/source/cpp_api/matrix_manipulation.rst            |  1 -
 docs/source/cpp_api/matrix_reduction.rst               |  2 +-
 docs/source/cpp_api/mdspan_representation.rst          |  2 --
 docs/source/cpp_api/mdspan_span.rst                    |  1 -
 docs/source/cpp_api/mnmg.rst                           |  1 -
 docs/source/cpp_api/random.rst                         |  1 -
 docs/source/cpp_api/random_datagen.rst                 |  1 -
 .../cpp_api/random_sampling_without_replacement.rst    |  2 --
 docs/source/cpp_api/sparse.rst                         |  1 -
 docs/source/cpp_api/sparse_types_coo_matrix.rst        |  1 -
 docs/source/cpp_api/sparse_types_csr_matrix.rst        |  1 -
 docs/source/cpp_api/stats_classification.rst           |  1 -
 docs/source/cpp_api/stats_probability.rst              |  1 -
 docs/source/cpp_api/stats_regression.rst               |  2 --
 docs/source/pylibraft_api/random.rst                   |  2 +-
 docs/source/pylibraft_api/sparse.rst                   |  2 +-
 python/pylibraft/.coveragerc                           |  2 +-
 python/pylibraft/pylibraft/tests/pytest.ini            |  1 -
 python/raft-dask/.coveragerc                           |  2 +-
 python/raft-dask/raft_dask/tests/pytest.ini            |  1 -
 thirdparty/LICENSES/LICENSE.ann-benchmark              |  2 +-
 thirdparty/LICENSES/LICENSE.faiss                      |  2 +-
 thirdparty/LICENSES/LICENSE.pytorch                    |  2 +-
 thirdparty/LICENSES/mdarray.license                    |  2 +-
 201 files changed, 177 insertions(+), 226 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4e0cf53c4d..6dfcc72417 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,11 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 repos:
+      - repo: https://github.com/pre-commit/pre-commit-hooks
+        rev: v5.0.0
+        hooks:
+              - id: trailing-whitespace
+              - id: end-of-file-fixer
       - repo: https://github.com/PyCQA/isort
         rev: 5.12.0
         hooks:
diff --git a/build.sh b/build.sh
index de3ebfa3c5..8f388e549c 100755
--- a/build.sh
+++ b/build.sh
@@ -473,4 +473,3 @@ if hasArg docs; then
     cd ${SPHINX_BUILD_DIR}
     sphinx-build -b html source _html
 fi
-
diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch
index a6423e9c08..31bbd25102 100644
--- a/cpp/cmake/patches/cutlass/build-export.patch
+++ b/cpp/cmake/patches/cutlass/build-export.patch
@@ -20,8 +20,7 @@ index 7419bdf5e..545384d82 100755
 -  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
 -  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
    )
- 
+
  # Mark CTK headers as system to supress warnings from them
--- 
+--
 2.34.1
-
diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
index 97755351c4..f3e2c78584 100644
--- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
@@ -227,4 +227,4 @@ void find_k(raft::resources const& handle,
                                                               n_iter);
   }
 }
-}  // namespace raft::cluster::detail
\ No newline at end of file
+}  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index 55becc8e15..2b77ca9963 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -204,4 +204,4 @@ void build_sorted_mst(
   raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
 }
 
-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh
index ccc6472684..0a21271271 100644
--- a/cpp/include/raft/cluster/detail/single_linkage.cuh
+++ b/cpp/include/raft/cluster/detail/single_linkage.cuh
@@ -122,4 +122,4 @@ void single_linkage(raft::resources const& handle,
   out->n_leaves               = m;
   out->n_connected_components = 1;
 }
-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp
index 385bc544b0..1cd77ca665 100644
--- a/cpp/include/raft/common/nvtx.hpp
+++ b/cpp/include/raft/common/nvtx.hpp
@@ -21,4 +21,4 @@
 
 #pragma once
 
-#include <raft/core/nvtx.hpp>
\ No newline at end of file
+#include <raft/core/nvtx.hpp>
diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp
index 52ac69f163..b812e28206 100644
--- a/cpp/include/raft/core/coo_matrix.hpp
+++ b/cpp/include/raft/core/coo_matrix.hpp
@@ -297,4 +297,4 @@ class coo_matrix
 
 /** @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/core/csr_matrix.hpp b/cpp/include/raft/core/csr_matrix.hpp
index 1113cc2023..4f7679bbae 100644
--- a/cpp/include/raft/core/csr_matrix.hpp
+++ b/cpp/include/raft/core/csr_matrix.hpp
@@ -309,4 +309,4 @@ class csr_matrix
 
 /** @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/core/detail/mdspan_util.cuh b/cpp/include/raft/core/detail/mdspan_util.cuh
index ded95c2f31..d3438bc07d 100644
--- a/cpp/include/raft/core/detail/mdspan_util.cuh
+++ b/cpp/include/raft/core/detail/mdspan_util.cuh
@@ -67,4 +67,4 @@ MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
 #endif  // compiler
 }
 
-}  // end namespace raft::detail
\ No newline at end of file
+}  // end namespace raft::detail
diff --git a/cpp/include/raft/core/device_coo_matrix.hpp b/cpp/include/raft/core/device_coo_matrix.hpp
index 41da605ff0..4ed67d5fc5 100644
--- a/cpp/include/raft/core/device_coo_matrix.hpp
+++ b/cpp/include/raft/core/device_coo_matrix.hpp
@@ -395,4 +395,4 @@ auto make_device_coordinate_structure_view(raft::device_span<RowType> rows,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/device_csr_matrix.hpp b/cpp/include/raft/core/device_csr_matrix.hpp
index 1d23c8912d..b0dbfa000d 100644
--- a/cpp/include/raft/core/device_csr_matrix.hpp
+++ b/cpp/include/raft/core/device_csr_matrix.hpp
@@ -422,4 +422,4 @@ auto make_device_compressed_structure_view(raft::device_span<IndptrType> indptr,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/device_span.hpp b/cpp/include/raft/core/device_span.hpp
index d3350b5e3a..abf72b6b2e 100644
--- a/cpp/include/raft/core/device_span.hpp
+++ b/cpp/include/raft/core/device_span.hpp
@@ -34,4 +34,4 @@ using device_span = span<T, true, extent>;
 /**
  * @}
  */
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/host_coo_matrix.hpp b/cpp/include/raft/core/host_coo_matrix.hpp
index 7a216dc8a2..e0f95d2a77 100644
--- a/cpp/include/raft/core/host_coo_matrix.hpp
+++ b/cpp/include/raft/core/host_coo_matrix.hpp
@@ -393,4 +393,4 @@ auto make_host_coordinate_structure_view(raft::host_span<RowType> rows,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/host_csr_matrix.hpp b/cpp/include/raft/core/host_csr_matrix.hpp
index e3cea3cd27..8a29d957f6 100644
--- a/cpp/include/raft/core/host_csr_matrix.hpp
+++ b/cpp/include/raft/core/host_csr_matrix.hpp
@@ -423,4 +423,4 @@ auto make_host_compressed_structure_view(raft::host_span<IndptrType> indptr,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp
index 3020cde32d..229619999d 100644
--- a/cpp/include/raft/core/host_mdarray.hpp
+++ b/cpp/include/raft/core/host_mdarray.hpp
@@ -253,4 +253,4 @@ auto make_host_vector(IndexType n)
   return make_host_mdarray<ElementType, IndexType, LayoutPolicy>(make_extents<IndexType>(n));
 }
 
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/host_span.hpp b/cpp/include/raft/core/host_span.hpp
index 36978dfca4..d31f8b4c30 100644
--- a/cpp/include/raft/core/host_span.hpp
+++ b/cpp/include/raft/core/host_span.hpp
@@ -35,4 +35,4 @@ using host_span = span<T, false, extent>;
  * @}
  */
 
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/resource/device_id.hpp b/cpp/include/raft/core/resource/device_id.hpp
index 570d815780..a371f9ddde 100644
--- a/cpp/include/raft/core/resource/device_id.hpp
+++ b/cpp/include/raft/core/resource/device_id.hpp
@@ -73,4 +73,4 @@ inline int get_device_id(resources const& res)
 /**
  * @}
  */
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/device_properties.hpp b/cpp/include/raft/core/resource/device_properties.hpp
index a87c29f709..7ac780ef16 100644
--- a/cpp/include/raft/core/resource/device_properties.hpp
+++ b/cpp/include/raft/core/resource/device_properties.hpp
@@ -75,4 +75,4 @@ inline cudaDeviceProp& get_device_properties(resources const& res)
 /**
  * @}
  */
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp
index 11d2aed1e0..b4fef75d57 100644
--- a/cpp/include/raft/core/resource/sub_comms.hpp
+++ b/cpp/include/raft/core/resource/sub_comms.hpp
@@ -79,4 +79,4 @@ inline void set_subcomm(resources const& res,
  * @}
  */
 
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp
index 55da3037a9..6e5092f50f 100644
--- a/cpp/include/raft/core/sparse_types.hpp
+++ b/cpp/include/raft/core/sparse_types.hpp
@@ -222,4 +222,4 @@ class sparse_matrix {
 
 /* @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h b/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
index 42de4860a0..56cce4de8b 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
@@ -406,4 +406,4 @@ struct FusedDistanceNNGemm<double,
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
\ No newline at end of file
+}  // namespace cutlass
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
index aaf2689dab..cc85a918a3 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
+++ b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
@@ -235,4 +235,4 @@ struct PairwiseDistanceGemm<double,
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
\ No newline at end of file
+}  // namespace cutlass
diff --git a/cpp/include/raft/distance/fused_distance_nn.cuh b/cpp/include/raft/distance/fused_distance_nn.cuh
index 25b1ae01ea..aa20bfeaf1 100755
--- a/cpp/include/raft/distance/fused_distance_nn.cuh
+++ b/cpp/include/raft/distance/fused_distance_nn.cuh
@@ -15,4 +15,4 @@
  */
 #pragma once
 
-#include "fused_distance_nn-inl.cuh"
\ No newline at end of file
+#include "fused_distance_nn-inl.cuh"
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 93c1080ff2..c539419738 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -118,4 +118,4 @@ void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zer
 };  // namespace label
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh
index 5513f16b9f..891bc9313a 100644
--- a/cpp/include/raft/label/detail/merge_labels.cuh
+++ b/cpp/include/raft/label/detail/merge_labels.cuh
@@ -155,4 +155,4 @@ void merge_labels(value_idx* labels_a,
 
 }  // namespace detail
 };  // namespace label
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index 2bf2fa830b..370b6b8996 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -68,4 +68,4 @@ void merge_labels(value_idx* labels_a,
 };  // namespace label
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index 292140b4dc..e938626b20 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -139,4 +139,4 @@ void choleskyRank1Update(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index a4247e618f..b377bad101 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -163,4 +163,4 @@ void coalesced_reduction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 121ac10e24..0227fea4a4 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -60,4 +60,4 @@ void addDevScalar(
 
 }  // namespace detail
 }  // namespace linalg
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index d23c7d60a6..2b9a7ba485 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -99,4 +99,4 @@ void divide_scalar(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 9f03f54f9a..7245d31191 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -223,4 +223,4 @@ void eig_jacobi(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 2e6c1a4ab5..569845d488 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -97,4 +97,4 @@ void eltwiseDivideCheckZero(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh
index 31bad62930..6b33561f48 100644
--- a/cpp/include/raft/linalg/gemv.cuh
+++ b/cpp/include/raft/linalg/gemv.cuh
@@ -307,4 +307,4 @@ void gemv(raft::resources const& handle,
 
 };  // namespace linalg
 };  // namespace raft
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/linalg_types.hpp b/cpp/include/raft/linalg/linalg_types.hpp
index 9c81fbc177..aa3e402988 100644
--- a/cpp/include/raft/linalg/linalg_types.hpp
+++ b/cpp/include/raft/linalg/linalg_types.hpp
@@ -39,4 +39,4 @@ enum class FillMode { UPPER, LOWER };
  */
 enum class Operation { NON_TRANSPOSE, TRANSPOSE };
 
-}  // end namespace raft::linalg
\ No newline at end of file
+}  // end namespace raft::linalg
diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh
index 21575d7806..5188e69268 100644
--- a/cpp/include/raft/linalg/lstsq.cuh
+++ b/cpp/include/raft/linalg/lstsq.cuh
@@ -248,4 +248,4 @@ void lstsq_qr(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh
index 1886c941b9..505aade1cf 100644
--- a/cpp/include/raft/linalg/map_reduce.cuh
+++ b/cpp/include/raft/linalg/map_reduce.cuh
@@ -115,4 +115,4 @@ void map_reduce(raft::resources const& handle,
 
 }  // end namespace raft::linalg
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index a69ac6df36..f4ab356f1c 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -91,4 +91,4 @@ template <typename InType,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh
index 85805c287a..ab8a3dbd04 100644
--- a/cpp/include/raft/linalg/matrix_vector.cuh
+++ b/cpp/include/raft/linalg/matrix_vector.cuh
@@ -200,4 +200,4 @@ void binary_sub(raft::resources const& handle,
 
 /** @} */  // end of matrix_vector
 
-}  // namespace raft::linalg
\ No newline at end of file
+}  // namespace raft::linalg
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 1a7668f8f2..f01af3b700 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -101,4 +101,4 @@ void multiply_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 5c7dcbd5cf..5f319a7537 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -157,4 +157,4 @@ void power_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index b032cbfa3a..ce07baea1f 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -123,4 +123,4 @@ void qr_get_qr(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index 8fd6e45d37..7f9ec0c197 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -170,4 +170,4 @@ void reduce(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 77ad8a9a80..e0f6fe257c 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -116,4 +116,4 @@ void reduce_cols_by_key(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 2bb14729f4..edb325acc1 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -195,4 +195,4 @@ void reduce_rows_by_key(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index a90ba165ed..404e8e2dd9 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -880,4 +880,4 @@ void randomized_svd(const raft::resources& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 81b7ab7dec..2dfa1ccd91 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -87,4 +87,4 @@ void sqrt(raft::resources const& handle, InType in, OutType out)
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index ac97c3cd68..c283d5721a 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -174,4 +174,4 @@ void strided_reduction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index e7b5c6b65a..dc59d955ad 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -226,4 +226,4 @@ void subtract_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 1c57515a47..11f700c2a9 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -420,4 +420,4 @@ void svd_reconstruction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index c94b2506d3..9e3b806702 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -136,4 +136,4 @@ void sort_cols_per_row(Args... args)
 
 };  // end namespace raft::matrix
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh
index a37ba550f9..6fa6ce4aee 100644
--- a/cpp/include/raft/matrix/detail/gather_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh
@@ -114,4 +114,4 @@ void gather(raft::resources const& handle,
 
 }  // namespace detail
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
index 6a60e5006b..c00d212c10 100644
--- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
@@ -126,4 +126,4 @@ void scatter(raft::resources const& handle,
 
 }  // end namespace detail
 }  // end namespace matrix
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp
index 6ed9a0d358..10a9f66ae3 100644
--- a/cpp/include/raft/matrix/math.hpp
+++ b/cpp/include/raft/matrix/math.hpp
@@ -20,4 +20,4 @@
 
 #pragma once
 
-#include "math.cuh"
\ No newline at end of file
+#include "math.cuh"
diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh
index ecfdb19191..8397f94a8d 100644
--- a/cpp/include/raft/matrix/norm.cuh
+++ b/cpp/include/raft/matrix/norm.cuh
@@ -41,4 +41,4 @@ m_t l2_norm(raft::resources const& handle, raft::device_mdspan<const m_t, idx_t>
 
 /** @} */  // end of group matrix_norm
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh
index 42057bb0f5..c10fa8f5f0 100644
--- a/cpp/include/raft/matrix/reverse.cuh
+++ b/cpp/include/raft/matrix/reverse.cuh
@@ -69,4 +69,4 @@ void row_reverse(raft::resources const& handle,
 }
 /** @} */  // end group matrix_reverse
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/scatter.cuh b/cpp/include/raft/matrix/scatter.cuh
index cd2d76a863..072f0c18ac 100644
--- a/cpp/include/raft/matrix/scatter.cuh
+++ b/cpp/include/raft/matrix/scatter.cuh
@@ -55,4 +55,4 @@ void scatter(raft::resources const& handle,
   detail::scatter(handle, inout, map, col_batch_size);
 }
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
index c922a0d7f4..caff6ea341 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
@@ -228,4 +228,4 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
   }
 };
 
-}  // namespace raft::neighbors::cagra::detail
\ No newline at end of file
+}  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/div_utils.hpp b/cpp/include/raft/neighbors/detail/div_utils.hpp
index 0455d0ec9b..4dd7b66d46 100644
--- a/cpp/include/raft/neighbors/detail/div_utils.hpp
+++ b/cpp/include/raft/neighbors/detail/div_utils.hpp
@@ -63,4 +63,4 @@ struct div_utils {
 #endif
   }
 };
-}  // namespace raft::neighbors::detail
\ No newline at end of file
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
index 5379788ab4..db03d78105 100644
--- a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
@@ -87,4 +87,4 @@ _RAFT_HOST_DEVICE void unpack_1(
     }
   }
 }
-}  // namespace raft::neighbors::ivf_flat::codepacker
\ No newline at end of file
+}  // namespace raft::neighbors::ivf_flat::codepacker
diff --git a/cpp/include/raft/random/detail/curand_wrappers.hpp b/cpp/include/raft/random/detail/curand_wrappers.hpp
index 969d739cc1..d62e64d532 100644
--- a/cpp/include/raft/random/detail/curand_wrappers.hpp
+++ b/cpp/include/raft/random/detail/curand_wrappers.hpp
@@ -54,4 +54,4 @@ inline curandStatus_t curandGenerateNormal(
 /** @} */
 
 };  // end namespace detail
-};  // end namespace raft::random
\ No newline at end of file
+};  // end namespace raft::random
diff --git a/cpp/include/raft/random/detail/permute.cuh b/cpp/include/raft/random/detail/permute.cuh
index 37caa51ad3..b1c56afa0c 100644
--- a/cpp/include/raft/random/detail/permute.cuh
+++ b/cpp/include/raft/random/detail/permute.cuh
@@ -161,4 +161,4 @@ void permute(IntType* perms,
 }
 
 };  // end namespace detail
-};  // end namespace raft::random
\ No newline at end of file
+};  // end namespace raft::random
diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh
index 4fd1f44f64..296b7ab283 100644
--- a/cpp/include/raft/random/make_blobs.cuh
+++ b/cpp/include/raft/random/make_blobs.cuh
@@ -187,4 +187,4 @@ void make_blobs(
 
 }  // end namespace raft::random
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/random/sample_without_replacement.cuh b/cpp/include/raft/random/sample_without_replacement.cuh
index fad1d4adfa..6e3d63ab9f 100644
--- a/cpp/include/raft/random/sample_without_replacement.cuh
+++ b/cpp/include/raft/random/sample_without_replacement.cuh
@@ -166,4 +166,4 @@ void sample_without_replacement(Args... args)
 
 /** @} */
 
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace raft::random
diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh
index 7ee0f5fbc3..2357c56422 100644
--- a/cpp/include/raft/solver/linear_assignment.cuh
+++ b/cpp/include/raft/solver/linear_assignment.cuh
@@ -331,4 +331,4 @@ class LinearAssignmentProblem {
 
 }  // namespace raft::solver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index b5568ef7d9..ba3efc7ff0 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -43,4 +43,4 @@ void csr_to_coo(
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
index a146113a86..6613049f25 100644
--- a/cpp/include/raft/sparse/convert/dense.cuh
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -64,4 +64,4 @@ void csr_to_dense(cusparseHandle_t handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh
index 0a498bb1ca..469dac3c86 100644
--- a/cpp/include/raft/sparse/convert/detail/coo.cuh
+++ b/cpp/include/raft/sparse/convert/detail/coo.cuh
@@ -76,4 +76,4 @@ void csr_to_coo(
 };  // end NAMESPACE detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/convert/detail/dense.cuh b/cpp/include/raft/sparse/convert/detail/dense.cuh
index e60e494d34..ec3d0ec1c3 100644
--- a/cpp/include/raft/sparse/convert/detail/dense.cuh
+++ b/cpp/include/raft/sparse/convert/detail/dense.cuh
@@ -141,4 +141,4 @@ void csr_to_dense(cusparseHandle_t handle,
 };  // namespace detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/detail/cusparse_macros.h b/cpp/include/raft/sparse/detail/cusparse_macros.h
index e7d81f51aa..d5262581a3 100644
--- a/cpp/include/raft/sparse/detail/cusparse_macros.h
+++ b/cpp/include/raft/sparse/detail/cusparse_macros.h
@@ -20,4 +20,4 @@
 
 #pragma once
 
-#include <raft/core/cusparse_macros.hpp>
\ No newline at end of file
+#include <raft/core/cusparse_macros.hpp>
diff --git a/cpp/include/raft/sparse/distance/detail/common.hpp b/cpp/include/raft/sparse/distance/detail/common.hpp
index 0f463dac80..19fe9c1786 100644
--- a/cpp/include/raft/sparse/distance/detail/common.hpp
+++ b/cpp/include/raft/sparse/distance/detail/common.hpp
@@ -56,4 +56,4 @@ class distances_t {
 };  // namespace detail
 };  // namespace distance
 };  // namespace sparse
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
index 38aa106d78..59cfcfa186 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -229,4 +229,4 @@ class chunked_mask_row_it : public mask_row_it<value_idx> {
 }  // namespace detail
 }  // namespace distance
 }  // namespace sparse
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
index 5a1c152bd0..4a075cf530 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -116,4 +116,4 @@ class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
 }  // namespace detail
 }  // namespace distance
 }  // namespace sparse
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
index ead44f0c51..5bcd1ff005 100644
--- a/cpp/include/raft/sparse/distance/distance.cuh
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -221,4 +221,4 @@ void pairwise_distance(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index def305afb2..a97b935f58 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -96,4 +96,4 @@ void csr_add_finalize(const int* a_ind,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 57c9b986b4..8ac97259da 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -120,4 +120,4 @@ void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh
index 3702111f83..2619048388 100644
--- a/cpp/include/raft/sparse/linalg/detail/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -232,4 +232,4 @@ void rowNormCsrCaller(const IdxType* ia,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/detail/transpose.h b/cpp/include/raft/sparse/linalg/detail/transpose.h
index 3a646b9a6e..579ee88d38 100644
--- a/cpp/include/raft/sparse/linalg/detail/transpose.h
+++ b/cpp/include/raft/sparse/linalg/detail/transpose.h
@@ -107,4 +107,4 @@ void csr_transpose(cusparseHandle_t handle,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index 43dd182fe5..7adf245abc 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -104,4 +104,4 @@ void rowNormCsr(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 4c0595bf91..276a64c125 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -40,4 +40,4 @@ void fit_embedding(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 1de8d5b426..8ee53cd3ae 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -165,4 +165,4 @@ void symmetrize(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh
index 4333060ad9..304cbf4936 100644
--- a/cpp/include/raft/sparse/linalg/transpose.cuh
+++ b/cpp/include/raft/sparse/linalg/transpose.cuh
@@ -68,4 +68,4 @@ void csr_transpose(raft::resources const& handle,
 
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
index c94c6254c3..ed4aa4c98f 100644
--- a/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
@@ -96,4 +96,4 @@ void cross_component_nn(
                              metric);
 }
 
-};  // end namespace raft::sparse::neighbors
\ No newline at end of file
+};  // end namespace raft::sparse::neighbors
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index c64c05ae4e..4b329325ca 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -91,4 +91,4 @@ void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index 52f1d3b239..b03192f111 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -84,4 +84,4 @@ void max_duplicates(raft::resources const& handle,
 };  // END namespace sparse
 };  // END namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index a799093226..b8d5a49d9f 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -45,4 +45,4 @@ void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cud
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/slice.cuh b/cpp/include/raft/sparse/op/slice.cuh
index 2da6dad4fc..e8a456d23e 100644
--- a/cpp/include/raft/sparse/op/slice.cuh
+++ b/cpp/include/raft/sparse/op/slice.cuh
@@ -78,4 +78,4 @@ void csr_row_slice_populate(value_idx start_offset,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/solver/lanczos.cuh b/cpp/include/raft/sparse/solver/lanczos.cuh
index fed31e6a9c..4c45a28cc6 100644
--- a/cpp/include/raft/sparse/solver/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/lanczos.cuh
@@ -230,4 +230,4 @@ int computeLargestEigenvectors(
 
 }  // namespace raft::sparse::solver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/cluster_solvers.cuh b/cpp/include/raft/spectral/cluster_solvers.cuh
index b693ac4af3..c273808cf8 100644
--- a/cpp/include/raft/spectral/cluster_solvers.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers.cuh
@@ -97,4 +97,4 @@ struct kmeans_solver_t {
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
index 40b0324548..139df1d27f 100644
--- a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
@@ -87,4 +87,4 @@ struct kmeans_solver_deprecated_t {
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/modularity_maximization.cuh b/cpp/include/raft/spectral/modularity_maximization.cuh
index ab1398a2a1..6514f7ef21 100644
--- a/cpp/include/raft/spectral/modularity_maximization.cuh
+++ b/cpp/include/raft/spectral/modularity_maximization.cuh
@@ -83,4 +83,4 @@ void analyzeModularity(raft::resources const& handle,
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/partition.cuh b/cpp/include/raft/spectral/partition.cuh
index f7ea456ac5..a2ac328aa1 100644
--- a/cpp/include/raft/spectral/partition.cuh
+++ b/cpp/include/raft/spectral/partition.cuh
@@ -92,4 +92,4 @@ void analyzePartition(raft::resources const& handle,
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/accuracy.cuh b/cpp/include/raft/stats/accuracy.cuh
index 6625d38a7a..0b352e185b 100644
--- a/cpp/include/raft/stats/accuracy.cuh
+++ b/cpp/include/raft/stats/accuracy.cuh
@@ -75,4 +75,4 @@ float accuracy(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index 1f97cd5f76..6822e069a2 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -86,4 +86,4 @@ double adjusted_rand_index(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index b669e0de32..f4667b37dc 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -88,4 +88,4 @@ double completeness_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/contingency_matrix.cuh b/cpp/include/raft/stats/contingency_matrix.cuh
index 16f0998435..03fa0d4924 100644
--- a/cpp/include/raft/stats/contingency_matrix.cuh
+++ b/cpp/include/raft/stats/contingency_matrix.cuh
@@ -214,4 +214,4 @@ void contingency_matrix(Args... args)
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/cov.cuh b/cpp/include/raft/stats/cov.cuh
index ad5d233c0e..096ec4bc1c 100644
--- a/cpp/include/raft/stats/cov.cuh
+++ b/cpp/include/raft/stats/cov.cuh
@@ -119,4 +119,4 @@ void cov(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index ee39c87a68..a7d4f2b877 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -47,4 +47,4 @@ void mean(
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index 4c861b49fb..c758584ec9 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -120,4 +120,4 @@ void vars(Type* var,
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh
index 39bd2c3b6c..4f5438b133 100644
--- a/cpp/include/raft/stats/detail/sum.cuh
+++ b/cpp/include/raft/stats/detail/sum.cuh
@@ -34,4 +34,4 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index ada0995f7d..9b96ed5949 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -72,4 +72,4 @@ void weightedMean(Type* mu,
 }
 };  // end namespace detail
 };  // end namespace stats
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/dispersion.cuh b/cpp/include/raft/stats/dispersion.cuh
index ded7c8178b..444cc04bca 100644
--- a/cpp/include/raft/stats/dispersion.cuh
+++ b/cpp/include/raft/stats/dispersion.cuh
@@ -131,4 +131,4 @@ value_t cluster_dispersion(
 }  // end namespace stats
 }  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh
index fe432569ee..a0c6ae5bdb 100644
--- a/cpp/include/raft/stats/entropy.cuh
+++ b/cpp/include/raft/stats/entropy.cuh
@@ -83,4 +83,4 @@ double entropy(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/homogeneity_score.cuh b/cpp/include/raft/stats/homogeneity_score.cuh
index 311cd599f8..3095d2c724 100644
--- a/cpp/include/raft/stats/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/homogeneity_score.cuh
@@ -91,4 +91,4 @@ double homogeneity_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 43d39cfd6c..bc3cf184c6 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -96,4 +96,4 @@ void mean(raft::resources const& handle,
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 83f9a8a941..fb9da4dd39 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -163,4 +163,4 @@ void mean_add(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh
index d2c410dab1..930a6f8b9e 100644
--- a/cpp/include/raft/stats/minmax.cuh
+++ b/cpp/include/raft/stats/minmax.cuh
@@ -141,4 +141,4 @@ void minmax(raft::resources const& handle,
 
 };  // namespace stats
 };  // namespace raft
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh
index 5a334e9280..c895a911e9 100644
--- a/cpp/include/raft/stats/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/mutual_info_score.cuh
@@ -89,4 +89,4 @@ double mutual_info_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/r2_score.cuh b/cpp/include/raft/stats/r2_score.cuh
index c98b4bc93a..4ff9f491d8 100644
--- a/cpp/include/raft/stats/r2_score.cuh
+++ b/cpp/include/raft/stats/r2_score.cuh
@@ -90,4 +90,4 @@ value_t r2_score(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh
index a21a0c0dc5..1230d615eb 100644
--- a/cpp/include/raft/stats/rand_index.cuh
+++ b/cpp/include/raft/stats/rand_index.cuh
@@ -75,4 +75,4 @@ double rand_index(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/regression_metrics.cuh b/cpp/include/raft/stats/regression_metrics.cuh
index 718170f716..74763de2fc 100644
--- a/cpp/include/raft/stats/regression_metrics.cuh
+++ b/cpp/include/raft/stats/regression_metrics.cuh
@@ -104,4 +104,4 @@ void regression_metrics(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/silhouette_score.cuh b/cpp/include/raft/stats/silhouette_score.cuh
index 23eef84604..15d86969af 100644
--- a/cpp/include/raft/stats/silhouette_score.cuh
+++ b/cpp/include/raft/stats/silhouette_score.cuh
@@ -223,4 +223,4 @@ value_t silhouette_score_batched(
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 0a67bd2325..62668b3ddd 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -185,4 +185,4 @@ void vars(raft::resources const& handle,
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 2c3ed1b83e..6c18a21988 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -88,4 +88,4 @@ void sum(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/trustworthiness_score.cuh b/cpp/include/raft/stats/trustworthiness_score.cuh
index 3f4464f4d3..2435cb4ef9 100644
--- a/cpp/include/raft/stats/trustworthiness_score.cuh
+++ b/cpp/include/raft/stats/trustworthiness_score.cuh
@@ -98,4 +98,4 @@ double trustworthiness_score(
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh
index 041adb5e38..1df3eab460 100644
--- a/cpp/include/raft/stats/v_measure.cuh
+++ b/cpp/include/raft/stats/v_measure.cuh
@@ -95,4 +95,4 @@ double v_measure(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index da22f0163c..a3e38f7168 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -189,4 +189,4 @@ void col_weighted_mean(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml b/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
index a5411082af..4357c207a1 100644
--- a/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
+++ b/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
@@ -37,27 +37,27 @@ jobs:
 
     - name: Create Build Environment
       run: cmake -E make_directory ${{github.workspace}}/mdspan-build
-      
+
     - name: Check Out
       uses: actions/checkout@v2
       with:
         path: ${{github.workspace}}/mdspan-src
-      
+
     - name: Configure CMake
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
       run: CXX=${{ matrix.compiler_prefix}}/${{ matrix.compiler_driver }} cmake $GITHUB_WORKSPACE/mdspan-src -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/mdspan-install -DMDSPAN_ENABLE_TESTS=ON -DMDSPAN_ENABLE_EXAMPLES=ON
-      
+
     - name: Build
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
       run: make -j
-      
+
     - name: Test
       working-directory: ${{github.workspace}}/mdspan-build
       shell: bash
       run: ctest
-            
+
     - name: Install
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
diff --git a/cpp/include/raft/thirdparty/mdspan/LICENSE b/cpp/include/raft/thirdparty/mdspan/LICENSE
index c68a8a2a9f..db92c208da 100644
--- a/cpp/include/raft/thirdparty/mdspan/LICENSE
+++ b/cpp/include/raft/thirdparty/mdspan/LICENSE
@@ -1,14 +1,14 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
 //
 // Kokkos is licensed under 3-clause BSD terms of use:
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -37,6 +37,6 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
diff --git a/cpp/include/raft/thirdparty/mdspan/README.md b/cpp/include/raft/thirdparty/mdspan/README.md
index a062777261..15af4dd4a9 100644
--- a/cpp/include/raft/thirdparty/mdspan/README.md
+++ b/cpp/include/raft/thirdparty/mdspan/README.md
@@ -70,4 +70,3 @@ Acknowledgements
 ================
 
 This work was undertaken as part of the [Kokkos project](https://github.com/kokkos/kokkos) at Sandia National Laboratories.  Sandia National Laboratories is a multimission laboratory managed and operated by National Technology & Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International Inc., for the U. S. Department of Energy's National Nuclear Security Administration under contract DE-NA0003525.
-
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
index 30391b3d70..3d5cbb955a 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
@@ -2,4 +2,4 @@
 mdspan_add_cuda_benchmark(sum_3d_cuda)
 target_include_directories(sum_3d_cuda PUBLIC
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/benchmarks/sum>
-)
\ No newline at end of file
+)
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
index 566c47c9ab..ccab58bfa1 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
@@ -4,4 +4,4 @@ if(OpenMP_CXX_FOUND)
   target_include_directories(sum_3d_openmp PUBLIC
       $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/benchmarks/sum>
   )
-endif()
\ No newline at end of file
+endif()
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
index 9ab6a0ddf4..ef75349925 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
@@ -174,4 +174,3 @@ BENCHMARK_CAPTURE(
 //================================================================================
 
 BENCHMARK_MAIN();
-
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
index f106e2f5ff..4cbfe029c7 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
@@ -223,4 +223,3 @@ BENCHMARK_CAPTURE(
 //================================================================================
 
 BENCHMARK_MAIN();
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
index ea2bad164c..ef45c9d18f 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
@@ -169,4 +169,3 @@ test<CP<CP<int*, int*>, CP<int*, int*>>, 4 * sizeof(int*), non_empty>();
 // </editor-fold> end compressed pair layout: 2 nested pairs, 4 leaf elements }}}1
 //==============================================================================
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
index 00126691aa..64d71d650c 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
@@ -176,4 +176,3 @@ MDSPAN_STATIC_TEST(
     stdex::extents<size_t,stdex::dynamic_extent, stdex::dynamic_extent, stdex::dynamic_extent>
   >::value
 );
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
index e293734444..fc30fa25e5 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
@@ -117,5 +117,3 @@ MDSPAN_STATIC_TEST(
 MDSPAN_STATIC_TEST(
   !std::is_constructible<LS1, NotARealLayout::mapping<E2>>::value
 );
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
index fa1136b9d6..c64fcdbabd 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
@@ -68,4 +68,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
index 9f7c6c052d..c44b02bf76 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
@@ -109,5 +109,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end layouts }}}1
 //==============================================================================
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
index d8edf31ab2..6e41433d6a 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
@@ -216,6 +216,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
index 73ab426afa..f6457234d7 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
@@ -212,6 +212,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp b/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
index b8740d5227..ba481c3144 100644
--- a/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
@@ -207,4 +207,3 @@ int main() {
     std::cout << "Success! SimpleTiledLayout2D works as expected." << std::endl;
   }
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
index 67356785c0..02e386e3aa 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
@@ -42,7 +42,7 @@
 */
 
 
-// NOTE: This code is prematurely taken from an example based on 
+// NOTE: This code is prematurely taken from an example based on
 // https://github.com/kokkos/mdspan/pull/176
 
 #pragma once
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
index 6be71b432c..3b4d69d63e 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
@@ -531,7 +531,7 @@ struct __extents_to_partially_static_sizes;
 template <class IndexType, size_t... ExtentsPack>
 struct __extents_to_partially_static_sizes<::std::experimental::extents<IndexType, ExtentsPack...>> {
   using type = detail::__partially_static_sizes<
-          typename ::std::experimental::extents<IndexType, ExtentsPack...>::index_type, size_t, 
+          typename ::std::experimental::extents<IndexType, ExtentsPack...>::index_type, size_t,
           ExtentsPack...>;
 };
 
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
index ed1478dc8b..92a291e915 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
@@ -237,4 +237,3 @@ class layout_left::mapping {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
index cd9c9c19bf..c761146874 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
@@ -45,7 +45,7 @@
 // NOTE: This code is prematurely taken from https://github.com/kokkos/mdspan/pull/180
 // and matches requirements described in https://github.com/ORNL/cpp-proposals-pub/pull/296
 // Some parts (as submdspan integration) are missing
-// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a 
+// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a
 // fixed stride to a padding alignment, allowing dimensions > padding_stride to be padded
 // to multiples of 'padding_stride'
 
@@ -140,7 +140,7 @@ namespace details {
 // layout_padded_left implementation
 
 namespace details {
-   
+
 
   // The *_helper functions work around not having C++20
   // templated lambdas: []<size_t... TrailingIndices>{} .
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
index a9b64ca36a..d4b71efae1 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
@@ -237,4 +237,3 @@ class layout_right::mapping {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
index 904dd40a75..90b1a46288 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
@@ -74,13 +74,13 @@ struct __no_unique_address_emulation<
                 // If the type isn't trivially destructible, its destructor
                 // won't be called at the right time, so don't use this
                 // specialization
-                _MDSPAN_TRAIT(is_trivially_destructible, _T)>> : 
+                _MDSPAN_TRAIT(is_trivially_destructible, _T)>> :
 #ifdef _MDSPAN_COMPILER_MSVC
     // MSVC doesn't allow you to access public static member functions of a type
     // when you *happen* to privately inherit from that type.
     protected
 #else
-    // But we still want this to be private if possible so that we don't accidentally 
+    // But we still want this to be private if possible so that we don't accidentally
     // access members of _T directly rather than calling __ref() first, which wouldn't
     // work if _T happens to be stateful and thus we're using the unspecialized definition
     // of __no_unique_address_emulation above.
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
index 7de72e6537..64845190ae 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
@@ -114,4 +114,3 @@ struct __type_at<3, __type_list<_T0, _T1, _T2, _T3, _Ts...>> {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
index fa710a59b6..60e06dd68e 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
@@ -45,4 +45,3 @@
 
 #include "mdspan"
 #include "__p1684_bits/mdarray.hpp"
-
diff --git a/cpp/include/raft/thirdparty/mdspan/make_single_header.py b/cpp/include/raft/thirdparty/mdspan/make_single_header.py
index 1b562c7176..98ab3526db 100755
--- a/cpp/include/raft/thirdparty/mdspan/make_single_header.py
+++ b/cpp/include/raft/thirdparty/mdspan/make_single_header.py
@@ -49,4 +49,3 @@ def process_file(file_path, out_lines=[], front_matter_lines=[], back_matter_lin
        "#define _MDSPAN_SINGLE_HEADER_INCLUDE_GUARD_\n"],
       ["#endif // _MDSPAN_SINGLE_HEADER_INCLUDE_GUARD_\n"],
       [abspath(sys.argv[1])]))
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
index d92834beb7..a30ce2c198 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
@@ -57,4 +57,3 @@ mdspan_add_test(test_layout_ctors)
 mdspan_add_test(test_layout_stride)
 mdspan_add_test(test_submdspan)
 mdspan_add_test(test_mdarray_ctors)
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
index f09b799684..e91896c1c4 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
@@ -424,4 +424,3 @@ TYPED_TEST(TestLayoutConversion, implicit_conversion) {
     ASSERT_EQ(map1.stride(r), map2.stride(r));
   }
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
index 3a3e1c2696..12008f05cf 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
@@ -164,4 +164,3 @@ TEST(TestLayoutStrideCTAD, test_ctad) {
 */
 }
 #endif
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
index 781a12a697..3dcb61d454 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
@@ -740,7 +740,7 @@ TEST(TestMdarrayCTAD, layout_stride) {
   ASSERT_EQ(m0.stride(1), 128);
   ASSERT_FALSE(m0.is_exhaustive());
 
-  /* 
+  /*
   stdex::mdarray m1{d.data(), stdex::layout_stride::mapping{stdex::extents{16, 32}, stdex::extents{1, 128}}};
   ASSERT_EQ(m1.data(), d.data());
   ASSERT_EQ(m1.rank(), 2);
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
index 81d3fdb983..14ae51a259 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
@@ -346,7 +346,7 @@ TEST(TestMdspanCTAD, layout_stride) {
   ASSERT_EQ(m0.stride(1), 128);
   ASSERT_FALSE(m0.is_exhaustive());
 
-  /* 
+  /*
   stdex::mdspan m1{d.data(), stdex::layout_stride::mapping{stdex::extents{16, 32}, stdex::extents{1, 128}}};
   ASSERT_EQ(m1.data(), d.data());
   ASSERT_EQ(m1.rank(), 2);
diff --git a/cpp/include/raft/util/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh
index f335be6fd0..9638a261a5 100644
--- a/cpp/include/raft/util/detail/popc.cuh
+++ b/cpp/include/raft/util/detail/popc.cuh
@@ -73,4 +73,4 @@ void popc(const raft::resources& res,
     });
 }
 
-}  // end namespace raft::detail
\ No newline at end of file
+}  // end namespace raft::detail
diff --git a/cpp/include/raft/util/input_validation.hpp b/cpp/include/raft/util/input_validation.hpp
index 17bb53f22b..119fd9d2e2 100644
--- a/cpp/include/raft/util/input_validation.hpp
+++ b/cpp/include/raft/util/input_validation.hpp
@@ -129,4 +129,4 @@ constexpr bool is_scalar_view(mdspan<ElementType, Extents> m)
   return false;
 }
 
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/include/raft/util/warp_primitives.cuh b/cpp/include/raft/util/warp_primitives.cuh
index 953c137cdf..2a7c4e9127 100644
--- a/cpp/include/raft/util/warp_primitives.cuh
+++ b/cpp/include/raft/util/warp_primitives.cuh
@@ -256,4 +256,4 @@ DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_xor(T val,
   return output;
 }
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/scripts/run-clang-compile.py b/cpp/scripts/run-clang-compile.py
index 123f0e4075..8ed9aa00f0 100644
--- a/cpp/scripts/run-clang-compile.py
+++ b/cpp/scripts/run-clang-compile.py
@@ -253,12 +253,12 @@ def run_clang_command(clang_cmd, cwd):
 class LockContext(object):
     def __init__(self, lock=None) -> None:
         self._lock = lock
-    
+
     def __enter__(self):
         if self._lock:
             self._lock.acquire()
         return self
-    
+
     def __exit__(self, _, __, ___):
         if self._lock:
             self._lock.release()
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index 3d8bbcec4a..cad08ca551 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -296,12 +296,12 @@ def run_clang_tidy_command(tidy_cmd, cwd):
 class LockContext(object):
     def __init__(self, lock=None) -> None:
         self._lock = lock
-    
+
     def __enter__(self):
         if self._lock:
             self._lock.acquire()
         return self
-    
+
     def __exit__(self, _, __, ___):
         if self._lock:
             self._lock.release()
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index db5a8b5804..e08481fbd6 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -17,7 +17,7 @@
 # and exits gracefully if the file is not found. If a user wishes to specify a
 # config file at a nonstandard location, they may do so by setting the
 # environment variable RAPIDS_CMAKE_FORMAT_FILE.
-# 
+#
 # This script can be invoked directly anywhere within the project repository.
 # Alternatively, it may be invoked as a pre-commit hook via
 # `pre-commit run (cmake-format)|(cmake-lint)`.
diff --git a/cpp/tests/linalg/cholesky_r1.cu b/cpp/tests/linalg/cholesky_r1.cu
index f87e07402f..e506c89a79 100644
--- a/cpp/tests/linalg/cholesky_r1.cu
+++ b/cpp/tests/linalg/cholesky_r1.cu
@@ -170,4 +170,4 @@ TYPED_TEST(CholeskyR1Test, update) { this->testR1Update(); }
 TYPED_TEST(CholeskyR1Test, throwError) { this->testR1Error(); }
 
 };  // namespace linalg
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/tests/matrix/argmax.cu b/cpp/tests/matrix/argmax.cu
index cb3fd4a3fb..c0cf85cd38 100644
--- a/cpp/tests/matrix/argmax.cu
+++ b/cpp/tests/matrix/argmax.cu
@@ -110,4 +110,4 @@ INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/tests/matrix/argmin.cu b/cpp/tests/matrix/argmin.cu
index 060b4a78db..f0cacacf3a 100644
--- a/cpp/tests/matrix/argmin.cu
+++ b/cpp/tests/matrix/argmin.cu
@@ -110,4 +110,4 @@ INSTANTIATE_TEST_SUITE_P(ArgMinTest, ArgMinTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_SUITE_P(ArgMinTest, ArgMinTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/tests/matrix/diagonal.cu b/cpp/tests/matrix/diagonal.cu
index c6e1f1a0d2..0a1f2af825 100644
--- a/cpp/tests/matrix/diagonal.cu
+++ b/cpp/tests/matrix/diagonal.cu
@@ -116,4 +116,4 @@ INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestF, ::testing::ValuesIn(inputs
 INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/tests/matrix/gather.cu b/cpp/tests/matrix/gather.cu
index 4c13d0c1e9..f62805b2b8 100644
--- a/cpp/tests/matrix/gather.cu
+++ b/cpp/tests/matrix/gather.cu
@@ -246,4 +246,4 @@ GATHER_TEST((GatherTest<false, false, true, float, uint32_t, int64_t>),
 GATHER_TEST((GatherTest<false, false, true, float, int64_t, int64_t>),
             GatherInplaceTestFI64I64,
             inplace_inputs_i64);
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/tests/matrix/scatter.cu b/cpp/tests/matrix/scatter.cu
index 7f478c7b93..f539b9759a 100644
--- a/cpp/tests/matrix/scatter.cu
+++ b/cpp/tests/matrix/scatter.cu
@@ -140,4 +140,4 @@ const std::vector<ScatterInputs<int64_t>> inputs_i64 =
 
 SCATTER_TEST((ScatterTest<float, int>), ScatterTestFI32, inputs_i32);
 SCATTER_TEST((ScatterTest<float, int64_t>), ScatterTestFI64, inputs_i64);
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/tests/mr/device/buffer.cpp b/cpp/tests/mr/device/buffer.cpp
index d14aa09b7a..3d5652a591 100644
--- a/cpp/tests/mr/device/buffer.cpp
+++ b/cpp/tests/mr/device/buffer.cpp
@@ -92,4 +92,4 @@ TEST(Raft, DeviceBufferZeroResize)
 
 }  // namespace device
 }  // namespace mr
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/tests/mr/host/buffer.cpp b/cpp/tests/mr/host/buffer.cpp
index 5688ff6376..792160eb89 100644
--- a/cpp/tests/mr/host/buffer.cpp
+++ b/cpp/tests/mr/host/buffer.cpp
@@ -69,4 +69,4 @@ TEST(Raft, DeviceToHostBuffer)
 
 }  // namespace host
 }  // namespace mr
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/tests/neighbors/spatial_data.h b/cpp/tests/neighbors/spatial_data.h
index d71b47cf1e..b4352f706d 100644
--- a/cpp/tests/neighbors/spatial_data.h
+++ b/cpp/tests/neighbors/spatial_data.h
@@ -35,4 +35,4 @@ std::vector<float> spatial_data = {
   31.968599, -99.901813,  39.32098,  -111.093731, 37.431573, -78.656894,  44.558803, -72.577841,
   47.751074, -120.740139, 43.78444,  -88.787868,  38.597626, -80.454903,  43.075968, -107.290284};
 };  // namespace spatial
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/tests/stats/weighted_mean.cu b/cpp/tests/stats/weighted_mean.cu
index 407f3f14ea..e125fbc71e 100644
--- a/cpp/tests/stats/weighted_mean.cu
+++ b/cpp/tests/stats/weighted_mean.cu
@@ -340,4 +340,4 @@ TEST_P(WeightedMeanTestD, Result)
 INSTANTIATE_TEST_CASE_P(WeightedMeanTest, WeightedMeanTestD, ::testing::ValuesIn(inputsd));
 
 };  // end namespace stats
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/tests/test_utils.cuh b/cpp/tests/test_utils.cuh
index 810a0d7985..ac4ed4d24e 100644
--- a/cpp/tests/test_utils.cuh
+++ b/cpp/tests/test_utils.cuh
@@ -330,4 +330,4 @@ inline std::vector<float> read_csv(std::string filename, bool skip_first_n_colum
   return result;
 }
 
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/docs/README.md b/docs/README.md
index a09ccf41eb..aa5e114347 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -11,4 +11,4 @@ bash build.sh docs
 #### Once the process finishes, documentation can be found in build/html
 ```shell script
 xdg-open build/html/index.html`
-```
\ No newline at end of file
+```
diff --git a/docs/source/_static/references.css b/docs/source/_static/references.css
index 225cf13ba9..d1f647233a 100644
--- a/docs/source/_static/references.css
+++ b/docs/source/_static/references.css
@@ -20,4 +20,4 @@ dl.citation > dt.label > span::before {
 /* Add closing bracket */
 dl.citation > dt.label > span::after {
   content: "]";
-}
\ No newline at end of file
+}
diff --git a/docs/source/contributing.md b/docs/source/contributing.md
index 1b4071d0a5..446e7b2a7b 100755
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -89,5 +89,3 @@ implementation of the issue, ask them in the issue instead of the PR.
 
 ## Attribution
 Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
-
-
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 74f706bf46..837cfa0cb0 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -16,4 +16,4 @@ C++ API
    cpp_api/solver.rst
    cpp_api/sparse.rst
    cpp_api/stats.rst
-   cpp_api/utils.rst
\ No newline at end of file
+   cpp_api/utils.rst
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index 4122a18506..f159c85af8 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -22,4 +22,4 @@ expose in public APIs.
    core_operators.rst
    core_math.rst
    core_bitset.rst
-   core_bitmap.rst
\ No newline at end of file
+   core_bitmap.rst
diff --git a/docs/source/cpp_api/core_bitmap.rst b/docs/source/cpp_api/core_bitmap.rst
index 6c1dc607bf..532da58e71 100644
--- a/docs/source/cpp_api/core_bitmap.rst
+++ b/docs/source/cpp_api/core_bitmap.rst
@@ -12,4 +12,4 @@ namespace *raft::core*
 .. doxygengroup:: bitmap
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/core_bitset.rst b/docs/source/cpp_api/core_bitset.rst
index af1cff6d37..117efc5466 100644
--- a/docs/source/cpp_api/core_bitset.rst
+++ b/docs/source/cpp_api/core_bitset.rst
@@ -12,4 +12,4 @@ namespace *raft::core*
 .. doxygengroup:: bitset
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/core_kvp.rst b/docs/source/cpp_api/core_kvp.rst
index 60a0da078b..5f0cfd800a 100644
--- a/docs/source/cpp_api/core_kvp.rst
+++ b/docs/source/cpp_api/core_kvp.rst
@@ -12,4 +12,3 @@ namespace *raft::core*
 .. doxygenstruct:: raft::KeyValuePair
     :project: RAFT
     :members:
-
diff --git a/docs/source/cpp_api/core_logger.rst b/docs/source/cpp_api/core_logger.rst
index 60714a63ea..569f17fac3 100644
--- a/docs/source/cpp_api/core_logger.rst
+++ b/docs/source/cpp_api/core_logger.rst
@@ -12,4 +12,3 @@ namespace *raft::core*
 .. doxygenclass:: raft::logger
     :project: RAFT
     :members:
-
diff --git a/docs/source/cpp_api/core_nvtx.rst b/docs/source/cpp_api/core_nvtx.rst
index addcbdda30..051c66da0c 100644
--- a/docs/source/cpp_api/core_nvtx.rst
+++ b/docs/source/cpp_api/core_nvtx.rst
@@ -13,5 +13,3 @@ namespace *raft::core*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst
index 3cd928c9db..b9da44e431 100644
--- a/docs/source/cpp_api/linalg.rst
+++ b/docs/source/cpp_api/linalg.rst
@@ -4,7 +4,7 @@ Linear Algebra
 This page provides C++ class references for the publicly-exposed elements of the `raft/linalg` (dense) linear algebra headers.
 In addition to providing highly optimized arithmetic and matrix/vector operations, RAFT provides a consistent user experience
 by providing common BLAS routines, standard linear system solvers, factorization and eigenvalue solvers. Some of these routines
-hide the complexities of lower-level C-based libraries provided in the CUDA toolkit 
+hide the complexities of lower-level C-based libraries provided in the CUDA toolkit
 
 .. role:: py(code)
    :language: c++
@@ -19,4 +19,4 @@ hide the complexities of lower-level C-based libraries provided in the CUDA tool
    linalg_map_reduce.rst
    linalg_matrix.rst
    linalg_matrix_vector.rst
-   linalg_solver.rst
\ No newline at end of file
+   linalg_solver.rst
diff --git a/docs/source/cpp_api/linalg_arithmetic.rst b/docs/source/cpp_api/linalg_arithmetic.rst
index 7bc428b9f0..badb9f31a5 100644
--- a/docs/source/cpp_api/linalg_arithmetic.rst
+++ b/docs/source/cpp_api/linalg_arithmetic.rst
@@ -114,4 +114,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/linalg_matrix.rst b/docs/source/cpp_api/linalg_matrix.rst
index e6024bcd02..30eef5f64f 100644
--- a/docs/source/cpp_api/linalg_matrix.rst
+++ b/docs/source/cpp_api/linalg_matrix.rst
@@ -16,4 +16,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/linalg_matrix_vector.rst b/docs/source/cpp_api/linalg_matrix_vector.rst
index d92a3c9874..cc22327c74 100644
--- a/docs/source/cpp_api/linalg_matrix_vector.rst
+++ b/docs/source/cpp_api/linalg_matrix_vector.rst
@@ -29,4 +29,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/matrix_manipulation.rst b/docs/source/cpp_api/matrix_manipulation.rst
index d0da51e4b7..5437ced99f 100644
--- a/docs/source/cpp_api/matrix_manipulation.rst
+++ b/docs/source/cpp_api/matrix_manipulation.rst
@@ -41,4 +41,3 @@ namespace *raft::matrix*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/matrix_reduction.rst b/docs/source/cpp_api/matrix_reduction.rst
index 440a1528b4..92dcea6428 100644
--- a/docs/source/cpp_api/matrix_reduction.rst
+++ b/docs/source/cpp_api/matrix_reduction.rst
@@ -16,4 +16,4 @@ namespace *raft::matrix*
 .. doxygengroup:: matrix_norm
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/mdspan_representation.rst b/docs/source/cpp_api/mdspan_representation.rst
index 386e6f14e9..939f1d51be 100644
--- a/docs/source/cpp_api/mdspan_representation.rst
+++ b/docs/source/cpp_api/mdspan_representation.rst
@@ -66,5 +66,3 @@ Accessors
 
 .. doxygentypedef:: raft::managed_accessor
     :project: RAFT
-
-
diff --git a/docs/source/cpp_api/mdspan_span.rst b/docs/source/cpp_api/mdspan_span.rst
index 870c4329d0..1b7d749810 100644
--- a/docs/source/cpp_api/mdspan_span.rst
+++ b/docs/source/cpp_api/mdspan_span.rst
@@ -25,4 +25,3 @@ span: One-dimensional Non-owning View
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/mnmg.rst b/docs/source/cpp_api/mnmg.rst
index 9543cbb4ee..1f9f75dd46 100644
--- a/docs/source/cpp_api/mnmg.rst
+++ b/docs/source/cpp_api/mnmg.rst
@@ -47,4 +47,3 @@ NCCL+UCX Comms
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/random.rst b/docs/source/cpp_api/random.rst
index 9f5cdc7a74..8eaa82c0b0 100644
--- a/docs/source/cpp_api/random.rst
+++ b/docs/source/cpp_api/random.rst
@@ -26,4 +26,3 @@ namespace *raft::random*
    random_sampling_univariate.rst
    random_sampling_multivariable.rst
    random_sampling_without_replacement.rst
-
diff --git a/docs/source/cpp_api/random_datagen.rst b/docs/source/cpp_api/random_datagen.rst
index a07f5e0154..e97283598e 100644
--- a/docs/source/cpp_api/random_datagen.rst
+++ b/docs/source/cpp_api/random_datagen.rst
@@ -43,4 +43,3 @@ namespace *raft::random*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/random_sampling_without_replacement.rst b/docs/source/cpp_api/random_sampling_without_replacement.rst
index ac0d3bea86..af5281a48b 100644
--- a/docs/source/cpp_api/random_sampling_without_replacement.rst
+++ b/docs/source/cpp_api/random_sampling_without_replacement.rst
@@ -22,5 +22,3 @@ namespace *raft::random*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst
index 64197accaf..ee170b3721 100644
--- a/docs/source/cpp_api/sparse.rst
+++ b/docs/source/cpp_api/sparse.rst
@@ -16,4 +16,3 @@ Core to RAFT's computational patterns for sparse data is its vocabulary of spars
    sparse_linalg.rst
    sparse_matrix.rst
    sparse_solver.rst
-
diff --git a/docs/source/cpp_api/sparse_types_coo_matrix.rst b/docs/source/cpp_api/sparse_types_coo_matrix.rst
index 855d89fdea..c1d8748a64 100644
--- a/docs/source/cpp_api/sparse_types_coo_matrix.rst
+++ b/docs/source/cpp_api/sparse_types_coo_matrix.rst
@@ -36,4 +36,3 @@ Host COO Matrix
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/sparse_types_csr_matrix.rst b/docs/source/cpp_api/sparse_types_csr_matrix.rst
index b704846c4e..22898a6399 100644
--- a/docs/source/cpp_api/sparse_types_csr_matrix.rst
+++ b/docs/source/cpp_api/sparse_types_csr_matrix.rst
@@ -36,4 +36,3 @@ Host CSR Matrix
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_classification.rst b/docs/source/cpp_api/stats_classification.rst
index 929d2808f3..bc472c831d 100644
--- a/docs/source/cpp_api/stats_classification.rst
+++ b/docs/source/cpp_api/stats_classification.rst
@@ -17,4 +17,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_probability.rst b/docs/source/cpp_api/stats_probability.rst
index 457879d87c..a77a0d9132 100644
--- a/docs/source/cpp_api/stats_probability.rst
+++ b/docs/source/cpp_api/stats_probability.rst
@@ -53,4 +53,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_regression.rst b/docs/source/cpp_api/stats_regression.rst
index 8c172b441d..fed5f806a4 100644
--- a/docs/source/cpp_api/stats_regression.rst
+++ b/docs/source/cpp_api/stats_regression.rst
@@ -41,5 +41,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/pylibraft_api/random.rst b/docs/source/pylibraft_api/random.rst
index 538d932757..dbfd7b2fa1 100644
--- a/docs/source/pylibraft_api/random.rst
+++ b/docs/source/pylibraft_api/random.rst
@@ -9,4 +9,4 @@ This page provides pylibraft class references for the publicly-exposed elements
    :class: highlight
 
 
-.. autofunction:: pylibraft.random.rmat
\ No newline at end of file
+.. autofunction:: pylibraft.random.rmat
diff --git a/docs/source/pylibraft_api/sparse.rst b/docs/source/pylibraft_api/sparse.rst
index b2c3f7a2b1..9ba265c6c9 100644
--- a/docs/source/pylibraft_api/sparse.rst
+++ b/docs/source/pylibraft_api/sparse.rst
@@ -8,4 +8,4 @@ This page provides pylibraft class references for the publicly-exposed elements
    :language: python
    :class: highlight
 
-.. autofunction:: pylibraft.sparse.linalg.eigsh
\ No newline at end of file
+.. autofunction:: pylibraft.sparse.linalg.eigsh
diff --git a/python/pylibraft/.coveragerc b/python/pylibraft/.coveragerc
index fc087fb9c5..3269e10b8a 100644
--- a/python/pylibraft/.coveragerc
+++ b/python/pylibraft/.coveragerc
@@ -1,3 +1,3 @@
 # Configuration file for Python coverage tests
 [run]
-source = pylibraft
\ No newline at end of file
+source = pylibraft
diff --git a/python/pylibraft/pylibraft/tests/pytest.ini b/python/pylibraft/pylibraft/tests/pytest.ini
index bf70c06f84..7b0a9f29fb 100644
--- a/python/pylibraft/pylibraft/tests/pytest.ini
+++ b/python/pylibraft/pylibraft/tests/pytest.ini
@@ -2,4 +2,3 @@
 
 [pytest]
 addopts = --tb=native
-
diff --git a/python/raft-dask/.coveragerc b/python/raft-dask/.coveragerc
index 968c4b898a..8077c9ae90 100644
--- a/python/raft-dask/.coveragerc
+++ b/python/raft-dask/.coveragerc
@@ -1,3 +1,3 @@
 # Configuration file for Python coverage tests
 [run]
-source = raft_dask
\ No newline at end of file
+source = raft_dask
diff --git a/python/raft-dask/raft_dask/tests/pytest.ini b/python/raft-dask/raft_dask/tests/pytest.ini
index bf70c06f84..7b0a9f29fb 100644
--- a/python/raft-dask/raft_dask/tests/pytest.ini
+++ b/python/raft-dask/raft_dask/tests/pytest.ini
@@ -2,4 +2,3 @@
 
 [pytest]
 addopts = --tb=native
-
diff --git a/thirdparty/LICENSES/LICENSE.ann-benchmark b/thirdparty/LICENSES/LICENSE.ann-benchmark
index 9f8e4222f6..4d04745ab4 100644
--- a/thirdparty/LICENSES/LICENSE.ann-benchmark
+++ b/thirdparty/LICENSES/LICENSE.ann-benchmark
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/thirdparty/LICENSES/LICENSE.faiss b/thirdparty/LICENSES/LICENSE.faiss
index 87cbf536c6..b96dcb0480 100644
--- a/thirdparty/LICENSES/LICENSE.faiss
+++ b/thirdparty/LICENSES/LICENSE.faiss
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/thirdparty/LICENSES/LICENSE.pytorch b/thirdparty/LICENSES/LICENSE.pytorch
index 7ad3d737a5..04f9ad1105 100644
--- a/thirdparty/LICENSES/LICENSE.pytorch
+++ b/thirdparty/LICENSES/LICENSE.pytorch
@@ -74,4 +74,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdparty/LICENSES/mdarray.license b/thirdparty/LICENSES/mdarray.license
index e636b86032..5a491b0879 100644
--- a/thirdparty/LICENSES/mdarray.license
+++ b/thirdparty/LICENSES/mdarray.license
@@ -39,4 +39,4 @@
 //
 // ************************************************************************
 //@HEADER
-*/
\ No newline at end of file
+*/

From 31d31518ece63793ef4bdc2dab8ffac92fc6d6aa Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 28 Jan 2025 16:30:05 -0600
Subject: [PATCH 33/37] Build and test with CUDA 12.8.0 (#2555)

This PR uses CUDA 12.8.0 to build and test.

xref: https://github.com/rapidsai/build-planning/issues/139

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/raft/pull/2555
---
 .../devcontainer.json                         |  8 ++---
 .../devcontainer.json                         | 12 +++----
 .github/workflows/build.yaml                  | 20 ++++++------
 .github/workflows/pr.yaml                     | 32 +++++++++----------
 .github/workflows/test.yaml                   | 10 +++---
 .../trigger-breaking-change-alert.yaml        |  2 +-
 README.md                                     |  4 +--
 ...64.yaml => all_cuda-128_arch-aarch64.yaml} |  4 +--
 ..._64.yaml => all_cuda-128_arch-x86_64.yaml} |  4 +--
 dependencies.yaml                             |  6 +++-
 docs/source/build.md                          |  6 ++--
 11 files changed, 56 insertions(+), 52 deletions(-)
 rename .devcontainer/{cuda12.5-conda => cuda12.8-conda}/devcontainer.json (91%)
 rename .devcontainer/{cuda12.5-pip => cuda12.8-pip}/devcontainer.json (88%)
 rename conda/environments/{all_cuda-125_arch-aarch64.yaml => all_cuda-128_arch-aarch64.yaml} (95%)
 rename conda/environments/{all_cuda-125_arch-x86_64.yaml => all_cuda-128_arch-x86_64.yaml} (95%)

diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json
similarity index 91%
rename from .devcontainer/cuda12.5-conda/devcontainer.json
rename to .devcontainer/cuda12.8-conda/devcontainer.json
index dc4fcd02fd..0995e354af 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.8-conda/devcontainer.json
@@ -3,7 +3,7 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
       "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
     }
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
@@ -20,7 +20,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
@@ -29,7 +29,7 @@
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json
similarity index 88%
rename from .devcontainer/cuda12.5-pip/devcontainer.json
rename to .devcontainer/cuda12.8-pip/devcontainer.json
index 2bcfa8733f..137699dc5f 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.8-pip/devcontainer.json
@@ -3,20 +3,20 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.18.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
-      "version": "12.5",
+      "version": "12.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
@@ -29,7 +29,7 @@
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
@@ -37,7 +37,7 @@
     "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index d484bcae22..cdcb95efad 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -56,7 +56,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -68,7 +68,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-libraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -80,7 +80,7 @@ jobs:
   wheel-publish-libraft:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -91,7 +91,7 @@ jobs:
   wheel-build-pylibraft:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -122,7 +122,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9a51c783e9..af963bbc8a 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -28,7 +28,7 @@ jobs:
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.8.0
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -46,7 +46,7 @@ jobs:
           repo: raft
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-12.8.0
     with:
       files_yaml: |
         test_cpp:
@@ -70,47 +70,47 @@ jobs:
           - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.8.0
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0
     with:
       build_type: pull-request
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -120,7 +120,7 @@ jobs:
   wheel-build-libraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       branch: ${{ inputs.branch }}
@@ -132,14 +132,14 @@ jobs:
   wheel-build-pylibraft:
     needs: [checks, wheel-build-libraft]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: [wheel-build-pylibraft, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
@@ -147,24 +147,24 @@ jobs:
   wheel-build-raft-dask:
     needs: [checks, wheel-build-libraft]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: [wheel-build-raft-dask, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.8.0
     with:
       arch: '["amd64"]'
-      cuda: '["12.5"]'
+      cuda: '["12.8"]'
       build_command: |
         sccache -z;
         build-all -DBUILD_PRIMS_BENCH=ON --verbose;
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 178c6f677c..8a4d8a5eb4 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -41,7 +41,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 01dd2436be..07f0f83cc9 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-12.8.0
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}
diff --git a/README.md b/README.md
index 898c5c22c3..2807ab50cc 100755
--- a/README.md
+++ b/README.md
@@ -240,7 +240,7 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers
 
 ```bash
-# for CUDA 12.5
-mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.5
+# for CUDA 12.8
+mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.8
 ```
 
 Note that the above commands will also install `libraft-headers` and `libraft`.
@@ -248,7 +248,7 @@ Note that the above commands will also install `libraft-headers` and `libraft`.
 You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project:
 ```bash
-# for CUDA 12.5
-mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5
+# for CUDA 12.8
+mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.8
 ```
 
 ### Installing Python through Pip
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml
similarity index 95%
rename from conda/environments/all_cuda-125_arch-aarch64.yaml
rename to conda/environments/all_cuda-128_arch-aarch64.yaml
index d790e985fa..1915a3f0f0 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-128_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
@@ -53,4 +53,4 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
-name: all_cuda-125_arch-aarch64
+name: all_cuda-128_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml
similarity index 95%
rename from conda/environments/all_cuda-125_arch-x86_64.yaml
rename to conda/environments/all_cuda-128_arch-x86_64.yaml
index 63808d99c0..c8119ff7d5 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
@@ -53,4 +53,4 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.28
 - ucx-py==0.42.*,>=0.0.0a0
-name: all_cuda-125_arch-x86_64
+name: all_cuda-128_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index b7a0344b1a..c9befcb53a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,7 +3,7 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.8", "12.5"]
+      cuda: ["11.8", "12.8"]
       arch: [x86_64, aarch64]
     includes:
       - build_common
@@ -274,6 +274,10 @@ dependencies:
               cuda: "12.5"
             packages:
               - cuda-version=12.5
+          - matrix:
+              cuda: "12.8"
+            packages:
+              - cuda-version=12.8
   cuda:
     specific:
       - output_types: conda
diff --git a/docs/source/build.md b/docs/source/build.md
index 5a0dbf7e11..237c54ce6b 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -42,7 +42,7 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers
 
 ```bash
-# for CUDA 12.0
+# for CUDA 12.8
-mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0
+mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.8
 ```
 
 Note that the above commands will also install `libraft-headers` and `libraft`.
@@ -50,7 +50,7 @@ Note that the above commands will also install `libraft-headers` and `libraft`.
 You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers to use in your project:
 ```bash
-# for CUDA 12.0
+# for CUDA 12.8
-mamba install -c rapidsai -c conda-forge -c nvidia libraft-headers cuda-version=12.0
+mamba install -c rapidsai -c conda-forge -c nvidia libraft-headers cuda-version=12.8
 ```
 
 ## Installing Python through Pip
@@ -99,7 +99,7 @@ In addition to the libraries included with cudatoolkit 11.8+, there are some oth
 
 Conda environment scripts are provided for installing the necessary dependencies to build both the C++ and Python libraries from source. It is preferred to use `mamba`, as it provides significant speedup over `conda`:
 ```bash
-mamba env create --name rapids_raft -f conda/environments/all_cuda-125_arch-x86_64.yaml
+mamba env create --name rapids_raft -f conda/environments/all_cuda-128_arch-x86_64.yaml
 mamba activate rapids_raft
 ```
 

From cceb37d953e1b4230f73157e3dba604176481547 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Malte=20F=C3=B6rster?=
 <97973773+mfoerste4@users.noreply.github.com>
Date: Thu, 30 Jan 2025 04:45:50 +0100
Subject: [PATCH 34/37] Remove 'sample' parameter from stats::mean API (#2389)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR removes the `sample` parameter from the `raft::stats::mean` API to prevent people from using it by accident, for example when computing the mean for a sample variance computation.

This also invalidates some of the test cases. Within RAFT, only test code is affected by this change, as the active usage of the `sample` parameter was already removed in #2381.

This PR is based on #2381 but was separated for tracking purposes.

~~Note that this requires adaption of downstream libraries using the API. I am aware of at least one occurrence in `cuml`.~~
The old API remains in the code, marked as deprecated, which allows us to adapt downstream libraries for at least the duration of one release cycle.

Authors:
  - Malte Förster (https://github.com/mfoerste4)
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2389
---
 cpp/include/raft/stats/detail/mean.cuh   |  20 ++-
 cpp/include/raft/stats/detail/scores.cuh |   2 +-
 cpp/include/raft/stats/mean.cuh          |  66 +++++++++-
 cpp/tests/random/rng.cu                  |   3 +-
 cpp/tests/stats/cov.cu                   |   4 +-
 cpp/tests/stats/mean.cu                  | 121 ++++++++----------
 cpp/tests/stats/mean_center.cu           | 149 ++++++++---------------
 cpp/tests/stats/stddev.cu                |   6 +-
 8 files changed, 184 insertions(+), 187 deletions(-)

diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index a7d4f2b877..1262d538c8 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -27,7 +27,25 @@ namespace stats {
 namespace detail {
 
 template <typename Type, typename IdxType = int>
-void mean(
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
+  Type ratio = Type(1) / Type(N);
+  raft::linalg::reduce(mu,
+                       data,
+                       D,
+                       N,
+                       Type(0),
+                       rowMajor,
+                       false,
+                       stream,
+                       false,
+                       raft::identity_op(),
+                       raft::add_op(),
+                       raft::mul_const_op<Type>(ratio));
+}
+
+template <typename Type, typename IdxType = int>
+[[deprecated]] void mean(
   Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
 {
   Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N));
diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh
index 947df6848a..66951f52ab 100644
--- a/cpp/include/raft/stats/detail/scores.cuh
+++ b/cpp/include/raft/stats/detail/scores.cuh
@@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
   rmm::device_scalar<math_t> y_bar(stream);
 
-  raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream);
+  raft::stats::mean(y_bar.data(), y, 1, n, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   rmm::device_uvector<math_t> sse_arr(n, stream);
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index bc3cf184c6..b76b945400 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,27 @@ namespace stats {
  * @param data: the input matrix
  * @param D: number of columns of data
  * @param N: number of rows of data
+ * @param rowMajor: whether the input data is row or col major
+ * @param stream: cuda stream
+ */
+template <typename Type, typename IdxType = int>
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
+  detail::mean(mu, data, D, N, rowMajor, stream);
+}
+
+/**
+ * @brief Compute mean of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ * Note: This overload is deprecated; please use the `mean` overload without the `sample` parameter.
+ *
+ * @tparam Type: the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param mu: the output mean vector
+ * @param data: the input matrix
+ * @param D: number of columns of data
+ * @param N: number of rows of data
  * @param sample: whether to evaluate sample mean or not. In other words,
  * whether
  *  to normalize the output using N-1 or N, for true or false, respectively
@@ -45,7 +66,7 @@ namespace stats {
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(
+[[deprecated("'sample' parameter deprecated")]] void mean(
   Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
 {
   detail::mean(mu, data, D, N, sample, rowMajor, stream);
@@ -67,14 +88,47 @@ void mean(
  * @param[in]  handle the raft handle
  * @param[in]  data: the input matrix
  * @param[out] mu: the output mean vector
- * @param[in]  sample: whether to evaluate sample mean or not. In other words, whether
- *   to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
 void mean(raft::resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
-          raft::device_vector_view<value_t, idx_t> mu,
-          bool sample)
+          raft::device_vector_view<value_t, idx_t> mu)
+{
+  static_assert(
+    std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
+    "Data layout not supported");
+  RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu");
+  RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous");
+  RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
+  detail::mean(mu.data_handle(),
+               data.data_handle(),
+               data.extent(1),
+               data.extent(0),
+               std::is_same_v<layout_t, raft::row_major>,
+               resource::get_cuda_stream(handle));
+}
+
+/**
+ * @brief Compute mean of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ * Note: This overload is deprecated; please use the `mean` overload without the `sample` parameter.
+ *
+ * @tparam value_t the data type
+ * @tparam idx_t index type
+ * @tparam layout_t Layout type of the input matrix.
+ * @param[in]  handle the raft handle
+ * @param[in]  data: the input matrix
+ * @param[out] mu: the output mean vector
+ * @param[in]  sample: whether to evaluate sample mean or not. In other words, whether
+ *   to normalize the output using N-1 or N, for true or false, respectively
+ */
+template <typename value_t, typename idx_t, typename layout_t>
+[[deprecated("'sample' parameter deprecated")]] void mean(
+  raft::resources const& handle,
+  raft::device_matrix_view<const value_t, idx_t, layout_t> data,
+  raft::device_vector_view<value_t, idx_t> mu,
+  bool sample)
 {
   static_assert(
     std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
diff --git a/cpp/tests/random/rng.cu b/cpp/tests/random/rng.cu
index a37f150d4c..172f94ae50 100644
--- a/cpp/tests/random/rng.cu
+++ b/cpp/tests/random/rng.cu
@@ -407,8 +407,7 @@ TEST(Rng, MeanError)
     RngState r(seed, rtype);
     normal(handle, r, data.data(), len, 3.3f, 0.23f);
     // uniform(r, data, len, -1.0, 2.0);
-    raft::stats::mean(
-      mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(mean_result.data(), data.data(), num_samples, num_experiments, false, stream);
     raft::stats::stddev(std_result.data(),
                         data.data(),
                         mean_result.data(),
diff --git a/cpp/tests/stats/cov.cu b/cpp/tests/stats/cov.cu
index 602f356b9f..3f2a3dcebf 100644
--- a/cpp/tests/stats/cov.cu
+++ b/cpp/tests/stats/cov.cu
@@ -72,7 +72,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     cov_act.resize(cols * cols, stream);
 
     normal(handle, r, data.data(), len, params.mean, var);
-    raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream);
+    raft::stats::mean(mean_act.data(), data.data(), cols, rows, params.rowMajor, stream);
     if (params.rowMajor) {
       using layout = raft::row_major;
       cov(handle,
@@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     raft::update_device(data_cm.data(), data_h, 6, stream);
     raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);
 
-    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream);
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, stream);
     cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
   }
 
diff --git a/cpp/tests/stats/mean.cu b/cpp/tests/stats/mean.cu
index c5fe83d95b..e72d4eaf74 100644
--- a/cpp/tests/stats/mean.cu
+++ b/cpp/tests/stats/mean.cu
@@ -33,7 +33,7 @@ template <typename T>
 struct MeanInputs {
   T tolerance, mean;
   int rows, cols;
-  bool sample, rowMajor;
+  bool rowMajor;
   unsigned long long int seed;
   T stddev = (T)1.0;
 };
@@ -42,7 +42,7 @@ template <typename T>
 ::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
 {
   return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", "
-            << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
+            << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
 }
 
 template <typename T>
@@ -74,14 +74,12 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
       using layout = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
     } else {
       using layout = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
     }
   }
 
@@ -98,72 +96,51 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
 const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, true, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, false, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, true, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, false, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, true, 1234ULL},
-  {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f},
-  {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, false, 1234ULL},
-  {2.0, -1.0, 31, 120, false, false, 1234ULL},
-  {2.0, -1.0, 1, 130, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, true, 1234ULL},
-  {2.0, -1.0, 31, 120, false, true, 1234ULL},
-  {2.0, -1.0, 1, 130, false, true, 1234ULL},
-  {2.0, -1.0, 1, 1, false, false, 1234ULL},
-  {2.0, -1.0, 1, 1, false, true, 1234ULL},
-  {2.0, -1.0, 7, 23, false, false, 1234ULL},
-  {2.0, -1.0, 7, 23, false, true, 1234ULL},
-  {2.0, -1.0, 17, 5, false, false, 1234ULL},
-  {2.0, -1.0, 17, 5, false, true, 1234ULL},
-  {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001},
-  {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}};
+  {0.15f, -1.f, 1024, 32, false, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, true, 1234ULL},
+  {0.15f, -1.f, 1024, 64, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, true, 1234ULL},
+  {0.15f, -1.f, 1024, 256, true, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, 1234ULL},
+  {2.0f, -1.f, 1, 130, false, 1234ULL},
+  {0.15f, -1.f, 1030, 1, true, 1234ULL},
+  {2.0f, -1.f, 31, 120, true, 1234ULL},
+  {2.0f, -1.f, 1, 130, true, 1234ULL},
+  {2.0f, -1.f, 1, 1, false, 1234ULL},
+  {2.0f, -1.f, 1, 1, true, 1234ULL},
+  {2.0f, -1.f, 7, 23, false, 1234ULL},
+  {2.0f, -1.f, 7, 23, true, 1234ULL},
+  {2.0f, -1.f, 17, 5, false, 1234ULL},
+  {2.0f, -1.f, 17, 5, true, 1234ULL},
+  {0.0001f, 0.1f, 1 << 27, 2, false, 1234ULL, 0.0001f},
+  {0.0001f, 0.1f, 1 << 27, 2, true, 1234ULL, 0.0001f}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, -1.0, 1024, 32, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, true, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, false, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, false, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, true, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, false, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, true, 1234ULL},
+                                                 {1e-8, 1e-1, 1 << 27, 2, false, 1234ULL, 0.0001},
+                                                 {1e-8, 1e-1, 1 << 27, 2, true, 1234ULL, 0.0001}};
 
 typedef MeanTest<float> MeanTestF;
 TEST_P(MeanTestF, Result)
diff --git a/cpp/tests/stats/mean_center.cu b/cpp/tests/stats/mean_center.cu
index b44d87d1bd..48bf50056c 100644
--- a/cpp/tests/stats/mean_center.cu
+++ b/cpp/tests/stats/mean_center.cu
@@ -32,7 +32,7 @@ template <typename T, typename IdxType>
 struct MeanCenterInputs {
   T tolerance, mean;
   IdxType rows, cols;
-  bool sample, rowMajor, bcastAlongRows;
+  bool rowMajor, bcastAlongRows;
   unsigned long long int seed;
 };
 
@@ -64,8 +64,7 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxTy
     auto len         = rows * cols;
     auto meanVecSize = params.bcastAlongRows ? cols : rows;
     normal(handle, r, data.data(), len, params.mean, (T)1.0);
-    raft::stats::mean(
-      meanVec.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream);
+    raft::stats::mean(meanVec.data(), data.data(), cols, rows, params.rowMajor, stream);
     if (params.rowMajor) {
       using layout = raft::row_major;
       mean_center(handle,
@@ -103,30 +102,18 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxTy
 };
 
 const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};
 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
 TEST_P(MeanCenterTestF_i32, Result)
 {
@@ -136,30 +123,18 @@ TEST_P(MeanCenterTestF_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};
 typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
 TEST_P(MeanCenterTestF_i64, Result)
 {
@@ -169,30 +144,18 @@ TEST_P(MeanCenterTestF_i64, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};
 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
 TEST_P(MeanCenterTestD_i32, Result)
 {
@@ -202,30 +165,18 @@ TEST_P(MeanCenterTestD_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};
 typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
 TEST_P(MeanCenterTestD_i64, Result)
 {
diff --git a/cpp/tests/stats/stddev.cu b/cpp/tests/stats/stddev.cu
index f4c5f92f49..a9a70b1e60 100644
--- a/cpp/tests/stats/stddev.cu
+++ b/cpp/tests/stats/stddev.cu
@@ -81,8 +81,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
 
       stddev(handle,
              raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
@@ -99,8 +98,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T>(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view<T>(mean_act.data(), cols));
 
       stddev(handle,
              raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),

From 19b8103077cbd5e16ad17c4a46788faf01fc9047 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 31 Jan 2025 08:02:58 -0800
Subject: [PATCH 35/37] Revert CUDA 12.8 shared workflow branch changes (#2560)

This PR points the shared workflow branches back to the default 25.02
branches.

xref: https://github.com/rapidsai/build-planning/issues/139
---
 .github/workflows/build.yaml                  | 20 ++++++-------
 .github/workflows/pr.yaml                     | 30 +++++++++----------
 .github/workflows/test.yaml                   | 10 +++----
 .../trigger-breaking-change-alert.yaml        |  2 +-
 4 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index cdcb95efad..d484bcae22 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -56,7 +56,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -68,7 +68,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-libraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -80,7 +80,7 @@ jobs:
   wheel-publish-libraft:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -91,7 +91,7 @@ jobs:
   wheel-build-pylibraft:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -112,7 +112,7 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -122,7 +122,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index af963bbc8a..dddee00d5f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -28,7 +28,7 @@ jobs:
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -46,7 +46,7 @@ jobs:
           repo: raft
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
     with:
       files_yaml: |
         test_cpp:
@@ -70,47 +70,47 @@ jobs:
           - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -120,7 +120,7 @@ jobs:
   wheel-build-libraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       branch: ${{ inputs.branch }}
@@ -132,14 +132,14 @@ jobs:
   wheel-build-pylibraft:
     needs: [checks, wheel-build-libraft]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: [wheel-build-pylibraft, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
@@ -147,21 +147,21 @@ jobs:
   wheel-build-raft-dask:
     needs: [checks, wheel-build-libraft]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: [wheel-build-raft-dask, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
       cuda: '["12.8"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8a4d8a5eb4..178c6f677c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -41,7 +41,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 07f0f83cc9..01dd2436be 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-12.8.0
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}

From e15a112d4f5f7f4fcf148ef5af15e8ed98ba89ba Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 31 Jan 2025 17:39:49 -0600
Subject: [PATCH 36/37] Fix docs builds (#2562)

This PR fixes two errors in docs builds:
1. a function with `void` return type had a `@return` parameter, which
causes an error. The error was `error: found documented return type for
raft::random::device::warp_random_sample that does not return anything`
2. a function with return type `std::vector<S>` was being misinterpreted
as the beginning of an HTML tag `<S>`. This resulted in `error: end of
comment block while expecting command </s>`.
---
 cpp/include/raft/random/device/sample.cuh | 6 ++++--
 cpp/include/raft/util/itertools.hpp       | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/random/device/sample.cuh b/cpp/include/raft/random/device/sample.cuh
index d0e5200185..67b98f12fe 100644
--- a/cpp/include/raft/random/device/sample.cuh
+++ b/cpp/include/raft/random/device/sample.cuh
@@ -27,12 +27,14 @@ namespace raft::random::device {
 
 /**
  * @brief warp-level random sampling of an index.
+ *
  * It selects an index with the given discrete probability
- * distribution(represented by weights of each index)
+ * distribution (represented by weights of each index).
+ * Only thread 0 will contain the valid reduced result.
+ *
  * @param rng random number generator, must have next_u32() function
  * @param weight weight of the rank/index.
  * @param idx index to be used as rank
- * @return only the thread0 will contain valid reduced result
  */
 template <typename T, typename rng_t, typename i_t = int>
 DI void warp_random_sample(rng_t& rng, T& weight, i_t& idx)
diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index 493ac9befe..a31d9f79df 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -36,7 +36,7 @@ namespace raft::util::itertools {
  *              fields of the structure (if the structure has more fields, some might be initialized
  *              with their default value).
  * @param lists One or more initializer lists.
- * @return std::vector<S> A vector of structures containing the cartesian product.
+ * @return `std::vector<S>` A vector of structures containing the cartesian product.
  */
 template <typename S, typename... Args>
 std::vector<S> product(std::initializer_list<Args>... lists)

From 7af57c3936313ecb5fab8dc0d758a26eb8f533ca Mon Sep 17 00:00:00 2001
From: Jake Awe <jawe@nvidia.com>
Date: Thu, 13 Feb 2025 09:44:59 -0600
Subject: [PATCH 37/37] Update Changelog [skip ci]

---
 CHANGELOG.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d7c641b21..a7f1d04beb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,59 @@
+# raft 25.02.00 (13 Feb 2025)
+
+## 🚨 Breaking Changes
+
+- Update pip devcontainers to UCX 1.18 ([#2550](https://github.com/rapidsai/raft/pull/2550)) [@jameslamb](https://github.com/jameslamb)
+- Switch over to rapids-logger ([#2530](https://github.com/rapidsai/raft/pull/2530)) [@vyasr](https://github.com/vyasr)
+- Adapt to rmm logger changes ([#2513](https://github.com/rapidsai/raft/pull/2513)) [@vyasr](https://github.com/vyasr)
+
+## 🐛 Bug Fixes
+
+- Rename test to tests. ([#2546](https://github.com/rapidsai/raft/pull/2546)) [@bdice](https://github.com/bdice)
+- Fix bit order of RMAT Rectangular Generator to match expectation ([#2542](https://github.com/rapidsai/raft/pull/2542)) [@mfoerste4](https://github.com/mfoerste4)
+- Fix broken link to python doc ([#2537](https://github.com/rapidsai/raft/pull/2537)) [@lowener](https://github.com/lowener)
+- Fix lanczos solver integer overflow ([#2536](https://github.com/rapidsai/raft/pull/2536)) [@viclafargue](https://github.com/viclafargue)
+- Fix rnd bit generation in rmat_rectangular_kernel ([#2524](https://github.com/rapidsai/raft/pull/2524)) [@tfeher](https://github.com/tfeher)
+
+## 📖 Documentation
+
+- Fix docs builds ([#2562](https://github.com/rapidsai/raft/pull/2562)) [@bdice](https://github.com/bdice)
+- [DOC] Fix sample codes ([#2518](https://github.com/rapidsai/raft/pull/2518)) [@enp1s0](https://github.com/enp1s0)
+
+## 🚀 New Features
+
+- Add cuda 12.8 support ([#2551](https://github.com/rapidsai/raft/pull/2551)) [@robertmaynard](https://github.com/robertmaynard)
+- Add support for different data type of bitset ([#2535](https://github.com/rapidsai/raft/pull/2535)) [@lowener](https://github.com/lowener)
+- [Feat] Support `bitset_to_csr` ([#2523](https://github.com/rapidsai/raft/pull/2523)) [@rhdong](https://github.com/rhdong)
+- Remove upper bounds on cuda-python to allow 12.6.2 and 11.8.5 ([#2517](https://github.com/rapidsai/raft/pull/2517)) [@bdice](https://github.com/bdice)
+
+## 🛠️ Improvements
+
+- Revert CUDA 12.8 shared workflow branch changes ([#2560](https://github.com/rapidsai/raft/pull/2560)) [@vyasr](https://github.com/vyasr)
+- Build and test with CUDA 12.8.0 ([#2555](https://github.com/rapidsai/raft/pull/2555)) [@bdice](https://github.com/bdice)
+- Update pip devcontainers to UCX 1.18 ([#2550](https://github.com/rapidsai/raft/pull/2550)) [@jameslamb](https://github.com/jameslamb)
+- use dynamic CUDA wheels on CUDA 11 ([#2548](https://github.com/rapidsai/raft/pull/2548)) [@jameslamb](https://github.com/jameslamb)
+- Normalize whitespace ([#2547](https://github.com/rapidsai/raft/pull/2547)) [@bdice](https://github.com/bdice)
+- Use cuda.bindings layout. ([#2545](https://github.com/rapidsai/raft/pull/2545)) [@bdice](https://github.com/bdice)
+- Revert "Introduction of the `raft::device_resources_snmg` type (#2487)" ([#2543](https://github.com/rapidsai/raft/pull/2543)) [@cjnolet](https://github.com/cjnolet)
+- Add missing `#include <cstdint>` ([#2540](https://github.com/rapidsai/raft/pull/2540)) [@jakirkham](https://github.com/jakirkham)
+- Use GCC 13 in CUDA 12 conda builds. ([#2539](https://github.com/rapidsai/raft/pull/2539)) [@bdice](https://github.com/bdice)
+- Use rapids-cmake for the logger ([#2534](https://github.com/rapidsai/raft/pull/2534)) [@vyasr](https://github.com/vyasr)
+- Check if nightlies have succeeded recently enough ([#2533](https://github.com/rapidsai/raft/pull/2533)) [@vyasr](https://github.com/vyasr)
+- remove unused 'joblib' and 'numba' dependencies, other packaging cleanup ([#2532](https://github.com/rapidsai/raft/pull/2532)) [@jameslamb](https://github.com/jameslamb)
+- introduce libraft wheels ([#2531](https://github.com/rapidsai/raft/pull/2531)) [@jameslamb](https://github.com/jameslamb)
+- Switch over to rapids-logger ([#2530](https://github.com/rapidsai/raft/pull/2530)) [@vyasr](https://github.com/vyasr)
+- reduce duplication, removed unused things in dependencies.yaml ([#2529](https://github.com/rapidsai/raft/pull/2529)) [@jameslamb](https://github.com/jameslamb)
+- Update cuda-python lower bounds to 12.6.2 / 11.8.5 ([#2522](https://github.com/rapidsai/raft/pull/2522)) [@bdice](https://github.com/bdice)
+- [Opt] Optimizing the performance of `bitmap_to_csr` ([#2516](https://github.com/rapidsai/raft/pull/2516)) [@rhdong](https://github.com/rhdong)
+- prefer system install of UCX in devcontainers, update outdated RAPIDS references ([#2514](https://github.com/rapidsai/raft/pull/2514)) [@jameslamb](https://github.com/jameslamb)
+- Adapt to rmm logger changes ([#2513](https://github.com/rapidsai/raft/pull/2513)) [@vyasr](https://github.com/vyasr)
+- Require approval to run CI on draft PRs ([#2512](https://github.com/rapidsai/raft/pull/2512)) [@bdice](https://github.com/bdice)
+- Shrink wheel size limit following removal of vector search APIs. ([#2509](https://github.com/rapidsai/raft/pull/2509)) [@bdice](https://github.com/bdice)
+- Forward-merge branch-24.12 to branch-25.02 ([#2508](https://github.com/rapidsai/raft/pull/2508)) [@bdice](https://github.com/bdice)
+- Introduction of the `raft::device_resources_snmg` type ([#2487](https://github.com/rapidsai/raft/pull/2487)) [@viclafargue](https://github.com/viclafargue)
+- Add breaking change workflow trigger ([#2482](https://github.com/rapidsai/raft/pull/2482)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Remove 'sample' parameter from stats::mean API ([#2389](https://github.com/rapidsai/raft/pull/2389)) [@mfoerste4](https://github.com/mfoerste4)
+
 # raft 24.12.00 (11 Dec 2024)
 
 ## 🚨 Breaking Changes