diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index dc12ab2ade..0f6a8b46af 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -13,6 +13,7 @@ RUN apt update -y \
  && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
 
 ENV DEFAULT_VIRTUAL_ENV=rapids
+ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
 
 FROM ${BASE} as conda-base
 
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 008bf8730a..8c857961c2 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 75aed80f9f..94b0909f6c 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json
similarity index 87%
rename from .devcontainer/cuda12.5-conda/devcontainer.json
rename to .devcontainer/cuda12.8-conda/devcontainer.json
index 240ba02131..0995e354af 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.8-conda/devcontainer.json
@@ -3,24 +3,24 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
@@ -29,7 +29,7 @@
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json
similarity index 85%
rename from .devcontainer/cuda12.5-pip/devcontainer.json
rename to .devcontainer/cuda12.8-pip/devcontainer.json
index c23c79017a..137699dc5f 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.8-pip/devcontainer.json
@@ -3,33 +3,33 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
-      "version": "12.5",
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
+      "version": "12.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/raft,type=bind,consistency=consistent",
@@ -37,7 +37,7 @@
     "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83ee5..e0ea775aad 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,4 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+auto_sync_draft: false
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 945589dc12..d484bcae22 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -56,7 +56,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -66,9 +66,32 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libraft:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libraft.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-publish-libraft:
+    needs: wheel-build-libraft
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libraft
+      package-type: cpp
   wheel-build-pylibraft:
+    needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -78,16 +101,18 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: pylibraft
+      package-type: python
   wheel-build-raft-dask:
+    needs: wheel-build-libraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -97,10 +122,11 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: raft_dask
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9c22edf74c..dddee00d5f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -20,19 +21,32 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
+      - wheel-build-libraft
       - wheel-build-pylibraft
       - wheel-tests-pylibraft
       - wheel-build-raft-dask
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: raft
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
     with:
       files_yaml: |
         test_cpp:
@@ -56,89 +70,101 @@ jobs:
           - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-pylibraft:
+  wheel-build-libraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: pull-request
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libraft.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-build-pylibraft:
+    needs: [checks, wheel-build-libraft]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: [wheel-build-pylibraft, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibraft.sh
   wheel-build-raft-dask:
-    needs: wheel-tests-pylibraft
+    needs: [checks, wheel-build-libraft]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: [wheel-build-raft-dask, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
-      cuda: '["12.5"]'
+      cuda: '["12.8"]'
       build_command: |
         sccache -z;
         build-all -DBUILD_PRIMS_BENCH=ON --verbose;
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 92020f6a76..178c6f677c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -41,7 +41,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
new file mode 100644
index 0000000000..01dd2436be
--- /dev/null
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -0,0 +1,26 @@
+name: Trigger Breaking Change Notifications
+
+on:
+  pull_request_target:
+    types:
+      - closed
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  trigger-notifier:
+    if: contains(github.event.pull_request.labels.*.name, 'breaking')
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
+    with:
+      sender_login: ${{ github.event.sender.login }}
+      sender_avatar: ${{ github.event.sender.avatar_url }}
+      repo: ${{ github.repository }}
+      pr_number: ${{ github.event.pull_request.number }}
+      pr_title: "${{ github.event.pull_request.title }}"
+      pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}"
+      pr_base_ref: ${{ github.event.pull_request.base.ref }}
+      pr_author: ${{ github.event.pull_request.user.login }}
+      event_action: ${{ github.event.action }}
+      pr_merged: ${{ github.event.pull_request.merged }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e3b3c8c440..6dfcc72417 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,11 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 repos:
+      - repo: https://github.com/pre-commit/pre-commit-hooks
+        rev: v5.0.0
+        hooks:
+              - id: trailing-whitespace
+              - id: end-of-file-fixer
       - repo: https://github.com/PyCQA/isort
         rev: 5.12.0
         hooks:
@@ -83,7 +88,7 @@ repos:
                 exclude: .*/thirdparty/.*
               - id: include-check
                 name: include-check
-                entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/test
+                entry: python ./cpp/scripts/include_checker.py cpp/bench cpp/include cpp/tests
                 pass_filenames: false
                 language: python
                 additional_dependencies: [gitpython]
@@ -98,7 +103,7 @@ repos:
                     ^CHANGELOG[.]md$|
                     ^cpp/cmake/patches/cutlass/build-export[.]patch$
       - repo: https://github.com/pre-commit/pre-commit-hooks
-        rev: v4.5.0
+        rev: v5.0.0
         hooks:
               - id: check-json
       - repo: https://github.com/rapidsai/pre-commit-hooks
@@ -110,8 +115,7 @@ repos:
                   [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$|
                   CMakeLists[.]txt$|
                   CMakeLists_standalone[.]txt$|
-                  meta[.]yaml$|
-                  setup[.]cfg$
+                  meta[.]yaml$
             exclude: |
               (?x)
                   cpp/include/raft/neighbors/detail/faiss_select/|
@@ -119,7 +123,7 @@ repos:
                   docs/source/sphinxext/github_link[.]py|
           - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.16.0
+        rev: v1.17.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d7c641b21..a7f1d04beb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,59 @@
+# raft 25.02.00 (13 Feb 2025)
+
+## 🚨 Breaking Changes
+
+- Update pip devcontainers to UCX 1.18 ([#2550](https://github.com/rapidsai/raft/pull/2550)) [@jameslamb](https://github.com/jameslamb)
+- Switch over to rapids-logger ([#2530](https://github.com/rapidsai/raft/pull/2530)) [@vyasr](https://github.com/vyasr)
+- Adapt to rmm logger changes ([#2513](https://github.com/rapidsai/raft/pull/2513)) [@vyasr](https://github.com/vyasr)
+
+## 🐛 Bug Fixes
+
+- Rename test to tests. ([#2546](https://github.com/rapidsai/raft/pull/2546)) [@bdice](https://github.com/bdice)
+- Fix bit order of RMAT Rectangular Generator to match expectation ([#2542](https://github.com/rapidsai/raft/pull/2542)) [@mfoerste4](https://github.com/mfoerste4)
+- Fix broken link to python doc ([#2537](https://github.com/rapidsai/raft/pull/2537)) [@lowener](https://github.com/lowener)
+- Fix lanczos solver integer overflow ([#2536](https://github.com/rapidsai/raft/pull/2536)) [@viclafargue](https://github.com/viclafargue)
+- Fix rnd bit generation in rmat_rectangular_kernel ([#2524](https://github.com/rapidsai/raft/pull/2524)) [@tfeher](https://github.com/tfeher)
+
+## 📖 Documentation
+
+- Fix docs builds ([#2562](https://github.com/rapidsai/raft/pull/2562)) [@bdice](https://github.com/bdice)
+- [DOC] Fix sample codes ([#2518](https://github.com/rapidsai/raft/pull/2518)) [@enp1s0](https://github.com/enp1s0)
+
+## 🚀 New Features
+
+- Add cuda 12.8 support ([#2551](https://github.com/rapidsai/raft/pull/2551)) [@robertmaynard](https://github.com/robertmaynard)
+- Add support for different data type of bitset ([#2535](https://github.com/rapidsai/raft/pull/2535)) [@lowener](https://github.com/lowener)
+- [Feat] Support `bitset_to_csr` ([#2523](https://github.com/rapidsai/raft/pull/2523)) [@rhdong](https://github.com/rhdong)
+- Remove upper bounds on cuda-python to allow 12.6.2 and 11.8.5 ([#2517](https://github.com/rapidsai/raft/pull/2517)) [@bdice](https://github.com/bdice)
+
+## 🛠️ Improvements
+
+- Revert CUDA 12.8 shared workflow branch changes ([#2560](https://github.com/rapidsai/raft/pull/2560)) [@vyasr](https://github.com/vyasr)
+- Build and test with CUDA 12.8.0 ([#2555](https://github.com/rapidsai/raft/pull/2555)) [@bdice](https://github.com/bdice)
+- Update pip devcontainers to UCX 1.18 ([#2550](https://github.com/rapidsai/raft/pull/2550)) [@jameslamb](https://github.com/jameslamb)
+- use dynamic CUDA wheels on CUDA 11 ([#2548](https://github.com/rapidsai/raft/pull/2548)) [@jameslamb](https://github.com/jameslamb)
+- Normalize whitespace ([#2547](https://github.com/rapidsai/raft/pull/2547)) [@bdice](https://github.com/bdice)
+- Use cuda.bindings layout. ([#2545](https://github.com/rapidsai/raft/pull/2545)) [@bdice](https://github.com/bdice)
+- Revert "Introduction of the `raft::device_resources_snmg` type (#2487)" ([#2543](https://github.com/rapidsai/raft/pull/2543)) [@cjnolet](https://github.com/cjnolet)
+- Add missing `#include <cstdint>` ([#2540](https://github.com/rapidsai/raft/pull/2540)) [@jakirkham](https://github.com/jakirkham)
+- Use GCC 13 in CUDA 12 conda builds. ([#2539](https://github.com/rapidsai/raft/pull/2539)) [@bdice](https://github.com/bdice)
+- Use rapids-cmake for the logger ([#2534](https://github.com/rapidsai/raft/pull/2534)) [@vyasr](https://github.com/vyasr)
+- Check if nightlies have succeeded recently enough ([#2533](https://github.com/rapidsai/raft/pull/2533)) [@vyasr](https://github.com/vyasr)
+- remove unused 'joblib' and 'numba' dependencies, other packaging cleanup ([#2532](https://github.com/rapidsai/raft/pull/2532)) [@jameslamb](https://github.com/jameslamb)
+- introduce libraft wheels ([#2531](https://github.com/rapidsai/raft/pull/2531)) [@jameslamb](https://github.com/jameslamb)
+- Switch over to rapids-logger ([#2530](https://github.com/rapidsai/raft/pull/2530)) [@vyasr](https://github.com/vyasr)
+- reduce duplication, removed unused things in dependencies.yaml ([#2529](https://github.com/rapidsai/raft/pull/2529)) [@jameslamb](https://github.com/jameslamb)
+- Update cuda-python lower bounds to 12.6.2 / 11.8.5 ([#2522](https://github.com/rapidsai/raft/pull/2522)) [@bdice](https://github.com/bdice)
+- [Opt] Optimizing the performance of `bitmap_to_csr` ([#2516](https://github.com/rapidsai/raft/pull/2516)) [@rhdong](https://github.com/rhdong)
+- prefer system install of UCX in devcontainers, update outdated RAPIDS references ([#2514](https://github.com/rapidsai/raft/pull/2514)) [@jameslamb](https://github.com/jameslamb)
+- Adapt to rmm logger changes ([#2513](https://github.com/rapidsai/raft/pull/2513)) [@vyasr](https://github.com/vyasr)
+- Require approval to run CI on draft PRs ([#2512](https://github.com/rapidsai/raft/pull/2512)) [@bdice](https://github.com/bdice)
+- Shrink wheel size limit following removal of vector search APIs. ([#2509](https://github.com/rapidsai/raft/pull/2509)) [@bdice](https://github.com/bdice)
+- Forward-merge branch-24.12 to branch-25.02 ([#2508](https://github.com/rapidsai/raft/pull/2508)) [@bdice](https://github.com/bdice)
+- Introduction of the `raft::device_resources_snmg` type ([#2487](https://github.com/rapidsai/raft/pull/2487)) [@viclafargue](https://github.com/viclafargue)
+- Add breaking change workflow trigger ([#2482](https://github.com/rapidsai/raft/pull/2482)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Remove 'sample' parameter from stats::mean API ([#2389](https://github.com/rapidsai/raft/pull/2389)) [@mfoerste4](https://github.com/mfoerste4)
+
 # raft 24.12.00 (11 Dec 2024)
 
 ## 🚨 Breaking Changes
diff --git a/README.md b/README.md
index 898c5c22c3..2807ab50cc 100755
--- a/README.md
+++ b/README.md
@@ -240,7 +240,7 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers
 
 ```bash
-# for CUDA 12.5
-mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.5
+# for CUDA 12.8
+mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.8
 ```
 
 Note that the above commands will also install `libraft-headers` and `libraft`.
@@ -248,7 +248,7 @@ Note that the above commands will also install `libraft-headers` and `libraft`.
 You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers and pre-compiled shared library to use in your project:
 ```bash
-# for CUDA 12.5
-mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.5
+# for CUDA 12.8
+mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.8
 ```
 
 ### Installing Python through Pip
diff --git a/VERSION b/VERSION
index af28c42b52..72eefaf7c7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.12.00
+25.02.00
diff --git a/build.sh b/build.sh
index a95cb8ee23..8f388e549c 100755
--- a/build.sh
+++ b/build.sh
@@ -347,13 +347,8 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
-# Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option.
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON"
-fi
 # Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
-SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
+SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
 
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
@@ -478,4 +473,3 @@ if hasArg docs; then
     cd ${SPHINX_BUILD_DIR}
     sphinx-build -b html source _html
 fi
-
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 326ee9a4c7..976da98998 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -5,6 +5,7 @@ set -euo pipefail
 
 package_name=$1
 package_dir=$2
+package_type=$3
 underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
 # Clear out system ucx files to ensure that we're getting ucx from the wheel.
@@ -20,24 +21,21 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXCLUDE_ARGS=(
-      --exclude "libcublas.so.12"
-      --exclude "libcublasLt.so.12"
-      --exclude "libcurand.so.10"
-      --exclude "libcusolver.so.11"
-      --exclude "libcusparse.so.12"
-      --exclude "libnvJitLink.so.12"
-      --exclude "libucp.so.0"
+EXCLUDE_ARGS=(
+  --exclude "libcublas.so.*"
+  --exclude "libcublasLt.so.*"
+  --exclude "libcurand.so.*"
+  --exclude "libcusolver.so.*"
+  --exclude "libcusparse.so.*"
+  --exclude "libnvJitLink.so.*"
+  --exclude "libucp.so.*"
+)
+
+if [[ ${package_name} != "libraft" ]]; then
+    EXCLUDE_ARGS+=(
+      --exclude "libraft.so"
     )
-  ;;
-  11.*)
-    EXCLUDE_ARGS=(
-      --exclude "libucp.so.0"
-    )
-  ;;
-esac
+fi
 
 sccache --zero-stats
 
@@ -55,4 +53,4 @@ sccache --show-adv-stats
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_type} final_dist
diff --git a/ci/build_wheel_libraft.sh b/ci/build_wheel_libraft.sh
new file mode 100755
index 0000000000..8ff0da1e9a
--- /dev/null
+++ b/ci/build_wheel_libraft.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="libraft"
+package_dir="python/libraft"
+
+rapids-logger "Generating build requirements"
+matrix_selectors="cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "${matrix_selectors}" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+ci/build_wheel.sh libraft ${package_dir} cpp
+ci/validate_wheel.sh ${package_dir} final_dist libraft
diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
index dacaa1190e..6f74e0e8c5 100755
--- a/ci/build_wheel_pylibraft.sh
+++ b/ci/build_wheel_pylibraft.sh
@@ -5,17 +5,16 @@ set -euo pipefail
 
 package_dir="python/pylibraft"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
-  ;;
-  11.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
-  ;;
-esac
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF${EXTRA_CMAKE_ARGS}"
+# Downloads libraft wheels from this current build,
+# then ensures 'pylibraft' wheel builds always use the 'libraft' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist
+echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-ci/build_wheel.sh pylibraft ${package_dir}
-ci/validate_wheel.sh ${package_dir} final_dist
+ci/build_wheel.sh pylibraft ${package_dir} python
+ci/validate_wheel.sh ${package_dir} final_dist pylibraft
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
index e4f3f0a833..0cacb6fe30 100755
--- a/ci/build_wheel_raft_dask.sh
+++ b/ci/build_wheel_raft_dask.sh
@@ -5,8 +5,16 @@ set -euo pipefail
 
 package_dir="python/raft-dask"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-ci/build_wheel.sh raft-dask ${package_dir}
-ci/validate_wheel.sh ${package_dir} final_dist
+# Downloads libraft wheels from this current build,
+# then ensures 'raft-dask' wheel builds always use the 'libraft' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist
+echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
+
+ci/build_wheel.sh raft-dask ${package_dir} python
+ci/validate_wheel.sh ${package_dir} final_dist raft-dask
diff --git a/ci/check_style.sh b/ci/check_style.sh
index d7ba4cae25..e0c30a2d41 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -14,5 +14,12 @@ rapids-dependency-file-generator \
 rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks
 
+# get config for cmake-format checks
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json"
+export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
+mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
+wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
+
 # Run pre-commit checks
 pre-commit run --all-files --show-diff-on-failure
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index a70fed9ec8..1ab9157b89 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -43,6 +43,8 @@ echo "${NEXT_FULL_TAG}" > VERSION
 
 DEPENDENCIES=(
   dask-cuda
+  libraft
+  librmm
   pylibraft
   rmm
   rapids-dask-dependency
diff --git a/ci/run_pylibraft_pytests.sh b/ci/run_pylibraft_pytests.sh
index 1167b89c5f..7f3d1f9cfb 100755
--- a/ci/run_pylibraft_pytests.sh
+++ b/ci/run_pylibraft_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_pylibraft_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/pylibraft/pylibraft
 
-pytest --cache-clear "$@" test
+pytest --cache-clear "$@" tests
diff --git a/ci/run_raft_dask_pytests.sh b/ci/run_raft_dask_pytests.sh
index 07d0b5baa0..a9e6a130cd 100755
--- a/ci/run_raft_dask_pytests.sh
+++ b/ci/run_raft_dask_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_raft_dask_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/raft-dask/raft_dask
 
-pytest --cache-clear --import-mode=append "$@" test
+pytest --cache-clear --import-mode=append "$@" tests
diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index b38f5a690b..26f4da267f 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -5,9 +5,13 @@ set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+
 
 # echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/pylibraft*.whl)[test]
+python -m pip install \
+    ./local-libraft-dep/libraft*.whl \
+    "$(echo ./dist/pylibraft*.whl)[test]"
 
-python -m pytest ./python/pylibraft/pylibraft/test
+python -m pytest ./python/pylibraft/pylibraft/tests
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index a778a3ec51..c394314aac 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -5,17 +5,17 @@ set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-
-# Download the pylibraft built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
+RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibraft-dep
+RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install -v \
+    ./local-libraft-dep/libraft*.whl \
     ./local-pylibraft-dep/pylibraft*.whl \
     "$(echo ./dist/raft_dask_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
 
-test_dir="python/raft-dask/raft_dask/test"
+test_dir="python/raft-dask/raft_dask/tests"
 
 rapids-logger "pytest raft-dask"
 python -m pytest --import-mode=append ${test_dir}
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index 5910a5c59f..ec3867aa30 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -5,6 +5,9 @@ set -euo pipefail
 
 package_dir=$1
 wheel_dir_relative_path=$2
+package_name=$3
+
+RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
 cd "${package_dir}"
 
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 6098cd12bf..ecd9aa1ece 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -14,19 +14,18 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -35,27 +34,27 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-aarch64=11.8
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-aarch64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- sysroot_linux-aarch64==2.28
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0fe8fbab39..2f655ae077 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -14,19 +14,18 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -35,27 +34,27 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
+- sysroot_linux-64==2.28
+- ucx-py==0.42.*,>=0.0.0a0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-aarch64.yaml
similarity index 68%
rename from conda/environments/all_cuda-125_arch-x86_64.yaml
rename to conda/environments/all_cuda-128_arch-aarch64.yaml
index bf6f5d6462..1915a3f0f0 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-128_arch-aarch64.yaml
@@ -16,42 +16,41 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
-- cuda-version=12.5
+- cuda-python>=12.6.2,<13.0a0
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
-- gcc_linux-64=11.*
+- gcc_linux-aarch64=13.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
-name: all_cuda-125_arch-x86_64
+- sysroot_linux-aarch64==2.28
+- ucx-py==0.42.*,>=0.0.0a0
+name: all_cuda-128_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml
similarity index 67%
rename from conda/environments/all_cuda-125_arch-aarch64.yaml
rename to conda/environments/all_cuda-128_arch-x86_64.yaml
index dfb9ac0b97..c8119ff7d5 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -16,42 +16,41 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
-- cuda-version=12.5
+- cuda-python>=12.6.2,<13.0a0
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0,<3.1.0a0
-- dask-cuda==24.12.*,>=0.0.0a0
-- distributed-ucxx==0.41.*,>=0.0.0a0
+- dask-cuda==25.2.*,>=0.0.0a0
+- distributed-ucxx==0.42.*,>=0.0.0a0
 - doxygen>=1.8.20
-- gcc_linux-aarch64=11.*
+- gcc_linux-64=13.*
 - graphviz
 - ipython
-- joblib>=0.11
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libucxx==0.41.*,>=0.0.0a0
+- libucxx==0.42.*,>=0.0.0a0
 - nccl>=2.19
 - ninja
-- numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.12.*,>=0.0.0a0
+- rapids-dask-dependency==25.2.*,>=0.0.0a0
 - recommonmark
-- rmm==24.12.*,>=0.0.0a0
+- rmm==25.2.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
 - scikit-learn
 - scipy
+- spdlog>=1.14.1,<1.15
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sysroot_linux-aarch64==2.17
-- ucx-py==0.41.*,>=0.0.0a0
-name: all_cuda-125_arch-aarch64
+- sysroot_linux-64==2.28
+- ucx-py==0.42.*,>=0.0.0a0
+name: all_cuda-128_arch-x86_64
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 4857f12cd1..11b16bc2a8 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index 503c4cb6fb..dbde4e3971 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -39,10 +39,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         {% endif %}
         - librmm
@@ -51,7 +49,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -85,11 +83,7 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
-        {% endif %}
         - librmm
     requirements:
       host:
@@ -130,10 +124,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -145,7 +137,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -196,10 +188,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         {% endif %}
     requirements:
@@ -207,7 +197,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -258,10 +248,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -273,7 +261,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml
index 001878ff25..83f5ebcb15 100644
--- a/conda/recipes/pylibraft/conda_build_config.yaml
+++ b/conda/recipes/pylibraft/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index 01a9d61f0f..8f498c7e50 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -1,7 +1,5 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
-# Usage:
-#   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -20,10 +18,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -33,7 +29,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
@@ -43,10 +39,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -61,10 +57,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - libraft {{ version }}
     - libraft-headers {{ version }}
@@ -81,5 +77,5 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: pylibraft library
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index d7d2f68b42..d567266027 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -1,26 +1,26 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 ucx_py_version:
-  - "0.41.*"
+  - "0.42.*"
 
 ucxx_version:
-  - "0.41.*"
+  - "0.42.*"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 02a8957b06..29c7f568f1 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -1,7 +1,5 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
-# Usage:
-#   conda build . -c conda-forge -c numba -c rapidsai -c pytorch
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
 {% set py_version = environ['CONDA_PY'] %}
@@ -20,10 +18,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -33,7 +29,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
@@ -43,10 +39,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -62,15 +58,14 @@ requirements:
   run:
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - dask-cuda ={{ minor_version }}
     - rapids-dask-dependency ={{ minor_version }}
-    - joblib >=0.11
     - nccl {{ nccl_version }}
     - pylibraft {{ version }}
     - python x.x
@@ -87,5 +82,5 @@ tests:
 about:
   home: https://rapids.ai/
   license: Apache-2.0
-  # license_file: LICENSE
+  license_file: LICENSE
   summary: raft-dask library
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 780f6f8581..c38471bebd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -65,9 +65,12 @@ set(RAFT_COMPILE_LIBRARY_DEFAULT OFF)
 if(BUILD_TESTS OR BUILD_PRIMS_BENCH)
   set(RAFT_COMPILE_LIBRARY_DEFAULT ON)
 endif()
-option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
+option(RAFT_COMPILE_LIBRARY "Enable building raft library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
+option(RAFT_COMPILE_DYNAMIC_ONLY "Only build the shared library and skip the
+static library. Has no effect if RAFT_COMPILE_LIBRARY is OFF" OFF
+)
 
 # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
 # have different values for the `Threads::Threads` target. Setting this flag ensures
@@ -100,6 +103,17 @@ set_property(
 )
 message(VERBOSE "RAFT: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.")
 
+# Set logging level
+set(LIBRAFT_LOGGING_LEVEL
+    "INFO"
+    CACHE STRING "Choose the logging level."
+)
+set_property(
+  CACHE LIBRAFT_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL"
+                                       "OFF"
+)
+message(VERBOSE "RAFT: LIBRAFT_LOGGING_LEVEL = '${LIBRAFT_LOGGING_LEVEL}'.")
+
 # ##################################################################################################
 # * Conda environment detection ----------------------------------------------
 
@@ -152,6 +166,10 @@ include(cmake/modules/ConfigureCUDA.cmake)
 # add third party dependencies using CPM
 rapids_cpm_init()
 
+include(${rapids-cmake-dir}/cpm/rapids_logger.cmake)
+rapids_cpm_rapids_logger()
+rapids_make_logger(raft LOGGER_HEADER_DIR include/raft/core EXPORT_SET raft-exports)
+
 # CCCL before rmm/cuco so we get the right version of CCCL
 include(cmake/thirdparty/get_cccl.cmake)
 include(cmake/thirdparty/get_rmm.cmake)
@@ -180,13 +198,19 @@ target_include_directories(
 )
 
 # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target.
-target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass CCCL::CCCL)
+target_link_libraries(
+  raft INTERFACE rmm::rmm rmm::rmm_logger spdlog::spdlog_header_only cuco::cuco
+                 nvidia::cutlass::cutlass CCCL::CCCL raft_logger
+)
 
 target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 target_compile_options(
   raft INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
                  --expt-relaxed-constexpr>
 )
+target_compile_definitions(
+  raft INTERFACE "RAFT_LOG_ACTIVE_LEVEL=RAFT_LOG_LEVEL_${LIBRAFT_LOGGING_LEVEL}"
+)
 
 set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix})
 set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix})
@@ -262,7 +286,6 @@ set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled)
 if(RAFT_COMPILE_LIBRARY)
   add_library(
     raft_objs OBJECT
-    src/core/logger.cpp
     src/linalg/detail/coalesced_reduction.cu
     src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu
     src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
@@ -288,18 +311,26 @@ if(RAFT_COMPILE_LIBRARY)
                       "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
 
-  add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
-  add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
+  # Make sure not to add the rmm logger twice since it will be brought in as an interface source by
+  # the rmm::rmm_logger_impl target.
+  add_library(raft_lib SHARED $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
+
+  set(_raft_lib_targets raft_lib)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    add_library(raft_lib_static STATIC $<FILTER:$<TARGET_OBJECTS:raft_objs>,EXCLUDE,rmm.*logger>)
+    list(APPEND _raft_lib_targets raft_lib_static)
+  endif()
 
   set_target_properties(
-    raft_lib raft_lib_static
+    ${_raft_lib_targets}
     PROPERTIES OUTPUT_NAME raft
                BUILD_RPATH "\$ORIGIN"
                INSTALL_RPATH "\$ORIGIN"
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
-  foreach(target raft_lib raft_lib_static raft_objs)
+  list(APPEND _raft_lib_targets raft_objs)
+  foreach(target IN LISTS _raft_lib_targets)
     target_link_libraries(
       ${target}
       PUBLIC raft::raft
@@ -313,6 +344,10 @@ if(RAFT_COMPILE_LIBRARY)
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
     target_link_options(${target} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
   endforeach()
+  target_link_libraries(raft_lib PRIVATE rmm::rmm_logger_impl raft_logger_impl)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    target_link_libraries(raft_lib_static PRIVATE rmm::rmm_logger_impl raft_logger_impl)
+  endif()
 endif()
 
 if(TARGET raft_lib AND (NOT TARGET raft::raft_lib))
@@ -324,20 +359,22 @@ target_link_libraries(raft_compiled INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS
 # ##################################################################################################
 # * raft_compiled_static----------------------------------------------------------------------------
 
-add_library(raft_compiled_static INTERFACE)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  add_library(raft_compiled_static INTERFACE)
 
-if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
-  add_library(raft::compiled_static ALIAS raft_compiled_static)
-endif()
-set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
+  if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
+    add_library(raft::compiled_static ALIAS raft_compiled_static)
+  endif()
+  set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
 
-if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
-  add_library(raft::raft_lib_static ALIAS raft_lib_static)
-endif()
+  if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
+    add_library(raft::raft_lib_static ALIAS raft_lib_static)
+  endif()
 
-target_link_libraries(
-  raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
-)
+  target_link_libraries(
+    raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
+  )
+endif()
 
 # ##################################################################################################
 # * raft_distributed -------------------------------------------------------------------------------
@@ -386,8 +423,12 @@ install(
   EXPORT raft-exports
 )
 
+set(_raft_compiled_install_targets raft_compiled)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  list(APPEND _raft_compiled_install_targets raft_compiled_static)
+endif()
 install(
-  TARGETS raft_compiled raft_compiled_static
+  TARGETS ${_raft_compiled_install_targets}
   DESTINATION ${lib_dir}
   COMPONENT raft
   EXPORT raft-compiled-exports
@@ -400,12 +441,14 @@ if(TARGET raft_lib)
     COMPONENT compiled
     EXPORT raft-compiled-lib-exports
   )
-  install(
-    TARGETS raft_lib_static
-    DESTINATION ${lib_dir}
-    COMPONENT compiled-static
-    EXPORT raft-compiled-static-lib-exports
-  )
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    install(
+      TARGETS raft_lib_static
+      DESTINATION ${lib_dir}
+      COMPONENT compiled-static
+      EXPORT raft-compiled-static-lib-exports
+    )
+  endif()
   install(
     DIRECTORY include/raft_runtime
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
@@ -476,8 +519,12 @@ endif()
 set(raft_components compiled distributed)
 set(raft_export_sets raft-compiled-exports raft-distributed-exports)
 if(TARGET raft_lib)
-  list(APPEND raft_components compiled compiled-static)
-  list(APPEND raft_export_sets raft-compiled-lib-exports raft-compiled-static-lib-exports)
+  list(APPEND raft_components compiled)
+  list(APPEND raft_export_sets raft-compiled-lib-exports)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    list(APPEND raft_components compiled-static)
+    list(APPEND raft_export_sets raft-compiled-static-lib-exports)
+  endif()
 endif()
 
 string(
@@ -539,7 +586,7 @@ endif()
 # * build test executable ----------------------------------------------------
 
 if(BUILD_TESTS)
-  add_subdirectory(test)
+  add_subdirectory(tests)
 endif()
 
 # ##################################################################################################
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index cf03a36612..edc1af4e02 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -32,6 +32,7 @@ function(ConfigureBench)
     PRIVATE raft::raft
             raft_internal
             $<$<BOOL:${ConfigureBench_LIB}>:raft::compiled>
+            $<$<NOT:$<BOOL:${ConfigureBench_LIB}>>:bench_rmm_logger>
             ${RAFT_CTK_MATH_DEPENDENCIES}
             benchmark::benchmark
             Threads::Threads
@@ -73,6 +74,9 @@ function(ConfigureBench)
 
 endfunction()
 
+add_library(bench_rmm_logger OBJECT)
+target_link_libraries(bench_rmm_logger PRIVATE rmm::rmm_logger_impl)
+
 if(BUILD_PRIMS_BENCH)
   ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp)
 
diff --git a/cpp/bench/prims/linalg/masked_matmul.cu b/cpp/bench/prims/linalg/masked_matmul.cu
index eda9cb1710..b96e14a25d 100644
--- a/cpp/bench/prims/linalg/masked_matmul.cu
+++ b/cpp/bench/prims/linalg/masked_matmul.cu
@@ -22,7 +22,7 @@
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/random/rng.cuh>
-#include <raft/sparse/linalg/masked_matmul.hpp>
+#include <raft/sparse/linalg/masked_matmul.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <cusparse_v2.h>
@@ -49,11 +49,14 @@ inline auto operator<<(std::ostream& os, const MaskedMatmulBenchParams<value_t>&
 {
   os << " m*k*n=" << params.m << "*" << params.k << "*" << params.n
      << "\tsparsity=" << params.sparsity;
-  if (params.sparsity == 1.0) { os << "<-inner product for comparison"; }
+  if (params.sparsity == 0.0) { os << "<-inner product for comparison"; }
   return os;
 }
 
-template <typename value_t, typename index_t = int64_t, typename bitmap_t = uint32_t>
+template <typename value_t,
+          bool bitmap_or_bitset = true,
+          typename index_t      = int64_t,
+          typename bits_t       = uint32_t>
 struct MaskedMatmulBench : public fixture {
   MaskedMatmulBench(const MaskedMatmulBenchParams<value_t>& p)
     : fixture(true),
@@ -64,15 +67,15 @@ struct MaskedMatmulBench : public fixture {
       c_indptr_d(0, stream),
       c_indices_d(0, stream),
       c_data_d(0, stream),
-      bitmap_d(0, stream),
+      bits_d(0, stream),
       c_dense_data_d(0, stream)
   {
-    index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bitmap_t) * 8));
-    std::vector<bitmap_t> bitmap_h(element);
+    index_t element = raft::ceildiv(index_t(params.m * params.n), index_t(sizeof(bits_t) * 8));
+    std::vector<bits_t> bits_h(element);
 
     a_data_d.resize(params.m * params.k, stream);
     b_data_d.resize(params.k * params.n, stream);
-    bitmap_d.resize(element, stream);
+    bits_d.resize(element, stream);
 
     raft::random::RngState rng(2024ULL);
     raft::random::uniform(
@@ -82,7 +85,13 @@ struct MaskedMatmulBench : public fixture {
 
     std::vector<bool> c_dense_data_h(params.m * params.n);
 
-    c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h);
+    if constexpr (bitmap_or_bitset) {
+      c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h);
+    } else {
+      c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h);
+      repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1);
+      c_true_nnz *= params.m;
+    }
 
     std::vector<value_t> values(c_true_nnz);
     std::vector<index_t> indices(c_true_nnz);
@@ -93,24 +102,49 @@ struct MaskedMatmulBench : public fixture {
     c_indices_d.resize(c_true_nnz, stream);
     c_dense_data_d.resize(params.m * params.n, stream);
 
-    cpu_convert_to_csr(bitmap_h, params.m, params.n, indices, indptr);
+    cpu_convert_to_csr(bits_h, params.m, params.n, indices, indptr);
     RAFT_EXPECTS(c_true_nnz == c_indices_d.size(),
                  "Something wrong. The c_true_nnz != c_indices_d.size()!");
 
     update_device(c_data_d.data(), values.data(), c_true_nnz, stream);
     update_device(c_indices_d.data(), indices.data(), c_true_nnz, stream);
     update_device(c_indptr_d.data(), indptr.data(), params.m + 1, stream);
-    update_device(bitmap_d.data(), bitmap_h.data(), element, stream);
+    update_device(bits_d.data(), bits_h.data(), element, stream);
+  }
+
+  // Appends `repeat` extra copies of the first `input_bits` bits of `inout`, each copy starting
+  // right after the previous one. Bits are OR-ed in, so the destination range must start clear.
+  void repeat_cpu_bitset_inplace(std::vector<bits_t>& inout, size_t input_bits, size_t repeat)
+  {
+    constexpr size_t bits_per_unit = sizeof(bits_t) * 8;
+
+    // Destination cursor: the first copy lands directly behind the source bits.
+    size_t dst = input_bits;
+
+    for (size_t copy = 0; copy < repeat; ++copy) {
+      for (size_t src = 0; src < input_bits; ++src, ++dst) {
+        // Fetch the source bit before touching the destination word.
+        const bits_t src_unit = inout[src / bits_per_unit];
+        const bool is_set     = (src_unit >> (src % bits_per_unit)) & 1;
+
+        inout[dst / bits_per_unit] |= (static_cast<bits_t>(is_set) << (dst % bits_per_unit));
+      }
+    }
+  }
 
-  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bits_t>& bits)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
-    for (auto& item : bitmap) {
-      item = static_cast<bitmap_t>(0);
+    if (sparsity == 0.0f) {
+      // Dense mask: all-ones in bits_t's own width (0xffffffff only covers 32 bits).
+      std::fill(bits.begin(), bits.end(), static_cast<bits_t>(~bits_t{0}));
+      return num_ones;
+    }
+    for (auto& item : bits) {
+      item = static_cast<bits_t>(0);
     }
 
     std::random_device rd;
@@ -120,8 +154,8 @@ struct MaskedMatmulBench : public fixture {
     while (num_ones > 0) {
       index_t index = dis(gen);
 
-      bitmap_t& element    = bitmap[index / (8 * sizeof(bitmap_t))];
-      index_t bit_position = index % (8 * sizeof(bitmap_t));
+      bits_t& element      = bits[index / (8 * sizeof(bits_t))];
+      index_t bit_position = index % (8 * sizeof(bits_t));
 
       if (((element >> bit_position) & 1) == 0) {
         element |= (static_cast<index_t>(1) << bit_position);
@@ -131,7 +165,7 @@ struct MaskedMatmulBench : public fixture {
     return res;
   }
 
-  void cpu_convert_to_csr(std::vector<bitmap_t>& bitmap,
+  void cpu_convert_to_csr(std::vector<bits_t>& bits,
                           index_t rows,
                           index_t cols,
                           std::vector<index_t>& indices,
@@ -142,14 +176,14 @@ struct MaskedMatmulBench : public fixture {
     indptr[offset_indptr++] = 0;
 
     index_t index        = 0;
-    bitmap_t element     = 0;
+    bits_t element       = 0;
     index_t bit_position = 0;
 
     for (index_t i = 0; i < rows; ++i) {
       for (index_t j = 0; j < cols; ++j) {
         index        = i * cols + j;
-        element      = bitmap[index / (8 * sizeof(bitmap_t))];
-        bit_position = index % (8 * sizeof(bitmap_t));
+        element      = bits[index / (8 * sizeof(bits_t))];
+        bit_position = index % (8 * sizeof(bits_t));
 
         if (((element >> bit_position) & 1)) {
           indices[offset_values] = static_cast<index_t>(j);
@@ -181,13 +215,17 @@ struct MaskedMatmulBench : public fixture {
       params.n,
       static_cast<index_t>(c_indices_d.size()));
 
-    auto mask =
-      raft::core::bitmap_view<const bitmap_t, index_t>(bitmap_d.data(), params.m, params.n);
-
     auto c = raft::make_device_csr_matrix_view<value_t>(c_data_d.data(), c_structure);
 
-    if (params.sparsity < 1.0) {
-      raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+    if (params.sparsity > 0.0) {
+      if constexpr (bitmap_or_bitset) {
+        auto mask =
+          raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+      } else {
+        auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+      }
     } else {
       raft::distance::pairwise_distance(handle,
                                         a_data_d.data(),
@@ -201,12 +239,16 @@ struct MaskedMatmulBench : public fixture {
     }
     resource::sync_stream(handle);
 
-    raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
-    resource::sync_stream(handle);
-
-    loop_on_state(state, [this, &a, &b, &mask, &c]() {
-      if (params.sparsity < 1.0) {
-        raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+    loop_on_state(state, [this, &a, &b, &c]() {
+      if (params.sparsity > 0.0) {
+        if constexpr (bitmap_or_bitset) {
+          auto mask =
+            raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+          raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+        } else {
+          auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+          raft::sparse::linalg::masked_matmul(handle, a, b, mask, c);
+        }
       } else {
         raft::distance::pairwise_distance(handle,
                                           a_data_d.data(),
@@ -228,7 +270,7 @@ struct MaskedMatmulBench : public fixture {
 
   rmm::device_uvector<value_t> a_data_d;
   rmm::device_uvector<value_t> b_data_d;
-  rmm::device_uvector<bitmap_t> bitmap_d;
+  rmm::device_uvector<bits_t> bits_d;
 
   rmm::device_uvector<value_t> c_dense_data_d;
 
@@ -253,7 +295,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
     raft::util::itertools::product<TestParams>({size_t(10), size_t(1024)},
                                                {size_t(128), size_t(1024)},
                                                {size_t(1024 * 1024)},
-                                               {0.01f, 0.1f, 0.2f, 0.5f, 1.0f});
+                                               {0.99f, 0.9f, 0.8f, 0.5f, 0.0f});
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
@@ -263,6 +305,7 @@ static std::vector<MaskedMatmulBenchParams<value_t>> getInputs()
   return param_vec;
 }
 
-RAFT_BENCH_REGISTER((MaskedMatmulBench<float>), "", getInputs<float>());
+RAFT_BENCH_REGISTER((MaskedMatmulBench<float, true>), "", getInputs<float>());
+RAFT_BENCH_REGISTER((MaskedMatmulBench<float, false>), "", getInputs<float>());
 
 }  // namespace raft::bench::linalg
diff --git a/cpp/bench/prims/sparse/bitmap_to_csr.cu b/cpp/bench/prims/sparse/bitmap_to_csr.cu
index ed53df3265..71aabb1bf9 100644
--- a/cpp/bench/prims/sparse/bitmap_to_csr.cu
+++ b/cpp/bench/prims/sparse/bitmap_to_csr.cu
@@ -71,7 +71,7 @@ struct BitmapToCsrBench : public fixture {
   index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
     for (auto& item : bitmap) {
@@ -141,7 +141,27 @@ const std::vector<bench_param<index_t>> getInputs()
   };
 
   const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
-    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.01f, 0.1f, 0.2f, 0.5f});
+    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});
+
+  param_vec.reserve(params_group.size());
+  for (TestParams params : params_group) {
+    param_vec.push_back(bench_param<index_t>({params.m, params.n, params.sparsity}));
+  }
+  return param_vec;
+}
+
+template <typename index_t = int64_t>
+const std::vector<bench_param<index_t>> getLargeInputs()
+{
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+
+  const std::vector<TestParams> params_group = raft::util::itertools::product<TestParams>(
+    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});
 
   param_vec.reserve(params_group.size());
   for (TestParams params : params_group) {
@@ -153,4 +173,6 @@ const std::vector<bench_param<index_t>> getInputs()
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int, float>), "", getInputs<int>());
 RAFT_BENCH_REGISTER((BitmapToCsrBench<uint64_t, int, double>), "", getInputs<int>());
 
+RAFT_BENCH_REGISTER((BitmapToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());
+
 }  // namespace raft::bench::sparse
diff --git a/cpp/bench/prims/sparse/bitset_to_csr.cu b/cpp/bench/prims/sparse/bitset_to_csr.cu
new file mode 100644
index 0000000000..fef2d44d3e
--- /dev/null
+++ b/cpp/bench/prims/sparse/bitset_to_csr.cu
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <sstream>
+#include <vector>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {
+  index_t n_repeat;  // row count of the CSR output (printed as "rows" in the benchmark label)
+  index_t n_cols;    // number of bits in the bitset, i.e. columns per row
+  float sparsity;    // fraction of bits left unset by create_sparse_matrix
+};
+
+// Streams the benchmark label: " rows*cols=<n_repeat>*<n_cols>\tsparsity=<sparsity>".
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{
+  os << " rows*cols=" << params.n_repeat << "*" << params.n_cols;
+  return os << "\tsparsity=" << params.sparsity;
+}
+
+// Benchmark fixture timing raft::sparse::convert::bitset_to_csr on a randomly filled bitset.
+template <typename bitset_t, typename index_t, typename value_t = float>
+struct BitsetToCsrBench : public fixture {
+  BitsetToCsrBench(const bench_param<index_t>& p)
+    : fixture(true),
+      handle(stream),
+      params(p),
+      bitset_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      values_d(0, stream),
+      nnz(0)
+  {
+    index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8));
+    std::vector<bitset_t> bitset_h(element);
+    nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h);
+
+    bitset_d.resize(bitset_h.size(), stream);
+    indptr_d.resize(params.n_repeat + 1, stream);
+    indices_d.resize(nnz, stream);
+    values_d.resize(nnz, stream);
+
+    update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+    resource::sync_stream(handle);
+  }
+
+  // Clears `bitset`, then sets trunc(m * n * (1 - sparsity)) distinct random bits within the
+  // first m * n positions. Returns the number of bits set.
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitset_t>& bitset)
+  {
+    index_t total    = static_cast<index_t>(m * n);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
+    index_t res      = num_ones;
+
+    for (auto& item : bitset) { item = static_cast<bitset_t>(0); }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis(0, total - 1);
+
+    while (num_ones > 0) {
+      index_t index = dis(gen);
+
+      bitset_t& element    = bitset[index / (8 * sizeof(bitset_t))];
+      index_t bit_position = index % (8 * sizeof(bitset_t));
+
+      if (((element >> bit_position) & 1) == 0) {
+        // The mask must be bitset_t-wide: shifting an index_t-typed 1 overflows for wider words.
+        element |= (static_cast<bitset_t>(1) << bit_position);
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), 1 * params.n_cols);
+
+    auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
+    auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
+
+    // One call ahead of the loop_on_state iterations below (presumably a warm-up).
+    raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);
+    resource::sync_stream(handle);
+    loop_on_state(state, [this, &bitset, &csr]() {
+      raft::sparse::convert::bitset_to_csr<bitset_t, index_t>(handle, bitset, csr);
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+  bench_param<index_t> params;
+
+  rmm::device_uvector<bitset_t> bitset_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<value_t> values_d;
+
+  index_t nnz;
+};  // struct BitsetToCsrBench
+
+// Parameter grid for the regular benchmark runs: each (m, n, sparsity) tuple yielded by
+// raft::util::itertools::product for the value lists below becomes one bench_param.
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{
+  struct Combo {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+
+  const std::vector<Combo> combos = raft::util::itertools::product<Combo>(
+    {index_t(10), index_t(1024)}, {index_t(1024 * 1024)}, {0.99f, 0.9f, 0.8f, 0.5f});
+
+  std::vector<bench_param<index_t>> inputs;
+  inputs.reserve(combos.size());
+  for (const auto& c : combos) { inputs.push_back(bench_param<index_t>{c.m, c.n, c.sparsity}); }
+  return inputs;
+}
+
+// Large-input parameter grid (100 * 1000000 columns): each (m, n, sparsity) tuple yielded by
+// raft::util::itertools::product for the value lists below becomes one bench_param.
+template <typename index_t = int64_t>
+const std::vector<bench_param<index_t>> getLargeInputs()
+{
+  struct Combo {
+    index_t m;
+    index_t n;
+    float sparsity;
+  };
+
+  const std::vector<Combo> combos = raft::util::itertools::product<Combo>(
+    {index_t(1), index_t(100)}, {index_t(100 * 1000000)}, {0.95f, 0.99f});
+
+  std::vector<bench_param<index_t>> inputs;
+  inputs.reserve(combos.size());
+  for (const auto& c : combos) { inputs.push_back(bench_param<index_t>{c.m, c.n, c.sparsity}); }
+  return inputs;
+}
+
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int, float>), "", getInputs<int>());
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint64_t, int, double>), "", getInputs<int>());
+
+RAFT_BENCH_REGISTER((BitsetToCsrBench<uint32_t, int64_t, float>), "", getLargeInputs<int64_t>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index b364d8418d..fbf4428650 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -14,7 +14,9 @@
 
 if(DISABLE_DEPRECATION_WARNINGS)
   list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
-  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
+  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations
+       -DRAFT_HIDE_DEPRECATION_WARNINGS
+  )
 endif()
 
 # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with
@@ -27,6 +29,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
     list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
   endif()
+
+  # Allow invalid CUDA kernels in the short term
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+    list(APPEND RAFT_CUDA_FLAGS -static-global-template-stub=false)
+  endif()
+
 endif()
 
 if(CUDA_LOG_COMPILE_TIME)
diff --git a/cpp/cmake/patches/cutlass/build-export.patch b/cpp/cmake/patches/cutlass/build-export.patch
index a6423e9c08..31bbd25102 100644
--- a/cpp/cmake/patches/cutlass/build-export.patch
+++ b/cpp/cmake/patches/cutlass/build-export.patch
@@ -20,8 +20,7 @@ index 7419bdf5e..545384d82 100755
 -  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
 -  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
    )
- 
+
  # Mark CTK headers as system to supress warnings from them
--- 
+--
 2.34.1
-
diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake
index 5a7d54ea4a..0e93363039 100644
--- a/cpp/cmake/thirdparty/get_rmm.cmake
+++ b/cpp/cmake/thirdparty/get_rmm.cmake
@@ -17,7 +17,7 @@
 function(find_and_configure_rmm)
     include(${rapids-cmake-dir}/cpm/rmm.cmake)
     rapids_cpm_rmm(BUILD_EXPORT_SET raft-exports
-                   INSTALL_EXPORT_SET  raft-exports)
+                   INSTALL_EXPORT_SET raft-exports)
 endfunction()
 
 find_and_configure_rmm()
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
index 57e38c2638..b1ffbe246f 100644
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -16,9 +16,9 @@
 function(find_and_configure_spdlog)
 
     include(${rapids-cmake-dir}/cpm/spdlog.cmake)
-    rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports)
-    rapids_export_package(BUILD spdlog rmm-exports)
+    rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET raft-exports)
+    rapids_export_package(BUILD spdlog raft-exports)
 
 endfunction()
 
-find_and_configure_spdlog()
\ No newline at end of file
+find_and_configure_spdlog()
diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh
index 4efeedcbaa..4203f0969b 100644
--- a/cpp/include/raft/cluster/detail/kmeans.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans.cuh
@@ -369,7 +369,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      rmm::device_uvector<char>& workspace)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -865,7 +865,7 @@ void kmeans_fit(raft::resources const& handle,
       params.n_clusters);
   }
 
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
 
   // Allocate memory
   rmm::device_uvector<char> workspace(0, stream);
@@ -1010,7 +1010,7 @@ void kmeans_predict(raft::resources const& handle,
   RAFT_EXPECTS(centroids.extent(1) == n_features,
                "invalid parameter (centroids.extent(1) != n_features)");
 
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   auto metric = params.metric;
 
   // Allocate memory
@@ -1201,7 +1201,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<DataT> X_new)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("kmeans_transform");
-  logger::get(RAFT_NAME).set_level(params.verbosity);
+  default_logger().set_level(params.verbosity);
   cudaStream_t stream = resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
diff --git a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
index 97755351c4..f3e2c78584 100644
--- a/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_auto_find_k.cuh
@@ -227,4 +227,4 @@ void find_k(raft::resources const& handle,
                                                               n_iter);
   }
 }
-}  // namespace raft::cluster::detail
\ No newline at end of file
+}  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
index 0a5a3ba5aa..5dcd679bd5 100644
--- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -20,6 +20,7 @@
 #include <raft/cluster/kmeans_balanced_types.hpp>
 #include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index 55becc8e15..2b77ca9963 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -204,4 +204,4 @@ void build_sorted_mst(
   raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
 }
 
-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/detail/single_linkage.cuh b/cpp/include/raft/cluster/detail/single_linkage.cuh
index ccc6472684..0a21271271 100644
--- a/cpp/include/raft/cluster/detail/single_linkage.cuh
+++ b/cpp/include/raft/cluster/detail/single_linkage.cuh
@@ -122,4 +122,4 @@ void single_linkage(raft::resources const& handle,
   out->n_leaves               = m;
   out->n_connected_components = 1;
 }
-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index 38318e8ec8..ee1fc83a9b 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -52,7 +52,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -61,7 +61,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp<IndexT, DataT>;
  *               params,
  *               X,
  *               std::nullopt,
- *               centroids,
+ *               centroids.view(),
  *               raft::make_scalar_view(&inertia),
  *               raft::make_scalar_view(&n_iter));
  * @endcode
@@ -107,7 +107,7 @@ template <typename DataT, typename IndexT>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
@@ -175,7 +175,7 @@ template <typename DataT, typename IndexT>
  *   #include <raft/cluster/kmeans_types.hpp>
  *   using namespace raft::cluster;
  *   ...
- *   raft::raft::resources handle;
+ *   raft::resources handle;
  *   raft::cluster::KMeansParams params;
  *   int n_features = 15, inertia, n_iter;
  *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp
index 4d956ad7a0..fbedd58417 100644
--- a/cpp/include/raft/cluster/kmeans_types.hpp
+++ b/cpp/include/raft/cluster/kmeans_types.hpp
@@ -82,7 +82,7 @@ struct KMeansParams : kmeans_base_params {
   /**
    * verbosity level.
    */
-  int verbosity = RAFT_LEVEL_INFO;
+  level_enum verbosity = level_enum::info;
 
   /**
    * Seed to the random number generator.
diff --git a/cpp/include/raft/common/logger.hpp b/cpp/include/raft/common/logger.hpp
deleted file mode 100644
index 77483e577d..0000000000
--- a/cpp/include/raft/common/logger.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * This file is deprecated and will be removed in release 22.08.
- * Please use the include/core/logger.hpp instead.
- */
-
-#pragma once
-
-#include <raft/core/logger.hpp>
\ No newline at end of file
diff --git a/cpp/include/raft/common/nvtx.hpp b/cpp/include/raft/common/nvtx.hpp
index 385bc544b0..1cd77ca665 100644
--- a/cpp/include/raft/common/nvtx.hpp
+++ b/cpp/include/raft/common/nvtx.hpp
@@ -21,4 +21,4 @@
 
 #pragma once
 
-#include <raft/core/nvtx.hpp>
\ No newline at end of file
+#include <raft/core/nvtx.hpp>
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 667c8be285..8481360897 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -52,7 +52,7 @@ using std_comms = detail::std_comms;
  * #include <raft/core/device_mdarray.hpp>
  *
  * ncclComm_t nccl_comm;
- * raft::raft::resources handle;
+ * raft::resources handle;
  *
  * build_comms_nccl_only(&handle, nccl_comm, 5, 0);
  * ...
@@ -98,7 +98,7 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank
  * #include <raft/core/device_mdarray.hpp>
  *
  * ncclComm_t nccl_comm;
- * raft::raft::resources handle;
+ * raft::resources handle;
  * ucp_worker_h ucp_worker;
  * ucp_ep_h *ucp_endpoints_arr;
  *
diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh
index 024b1244a6..b2c9df436f 100644
--- a/cpp/include/raft/core/bitmap.cuh
+++ b/cpp/include/raft/core/bitmap.cuh
@@ -22,6 +22,7 @@
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/sparse/convert/csr.cuh>
 
 #include <type_traits>
 
@@ -42,4 +43,11 @@ _RAFT_DEVICE void bitmap_view<bitmap_t, index_t>::set(const index_t row,
   set(row * cols_ + col, new_value);
 }
 
+template <typename bitmap_t, typename index_t>
+template <typename csr_matrix_t>
+void bitmap_view<bitmap_t, index_t>::to_csr(const raft::resources& res, csr_matrix_t& csr) const
+{
+  raft::sparse::convert::bitmap_to_csr(res, *this, csr);
+}
+
 }  // end namespace raft::core
diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp
index 86b2d77478..be305152e8 100644
--- a/cpp/include/raft/core/bitmap.hpp
+++ b/cpp/include/raft/core/bitmap.hpp
@@ -53,9 +53,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_ptr Device raw pointer
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Bit width of the element type the bitmap was originally created with,
+   * or 0 if it matches `bitmap_t`. Lets bits be addressed correctly when the data was written
+   * with a different element type than the ones currently supported in cuVS ANN indexes, so such
+   * a bitmap can still be used with those indexes.
    */
-  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols), rows_(rows), cols_(cols)
+  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr,
+                                index_t rows,
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
@@ -65,11 +74,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_span Device vector view of the bitmap
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Bit width of the element type the bitmap was originally created with,
+   * or 0 if it matches `bitmap_t`. Lets bits be addressed correctly when the data was written
+   * with a different element type than the ones currently supported in cuVS ANN indexes, so such
+   * a bitmap can still be used with those indexes.
    */
   _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
                                 index_t rows,
-                                index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols), rows_(rows), cols_(cols)
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
@@ -117,6 +133,26 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    */
   inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; }
 
+  /**
+   * @brief Converts to a Compressed Sparse Row (CSR) format matrix.
+   *
+   * This method transforms a two-dimensional bitmap matrix into a CSR representation,
+   * where each '1' bit in the bitmap corresponds to a non-zero entry in the CSR matrix.
+   * The bitmap is interpreted as a row-major matrix, with rows and columns defined by
+   * the dimensions of the bitmap.
+   *
+   * @tparam csr_matrix_t Specifies the CSR matrix type, constrained to raft::device_csr_matrix.
+   *
+   * @param[in] res RAFT resources for managing CUDA streams and execution policies.
+   * @param[out] csr Output parameter where the resulting CSR matrix is stored. Each '1' bit in
+   * the bitmap corresponds to a non-zero element in the CSR matrix.
+   *
+   * The caller must ensure that the `csr` matrix is pre-allocated with dimensions and a non-zero
+   * count matching the expected output.
+   */
+  template <typename csr_matrix_t>
+  void to_csr(const raft::resources& res, csr_matrix_t& csr) const;
+
  private:
   index_t rows_;
   index_t cols_;
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index d1bffdb81e..24ef3148b8 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -23,6 +23,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/reduce.cuh>
+#include <raft/sparse/convert/csr.cuh>
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/popc.cuh>
 
@@ -32,12 +33,41 @@
 
 namespace raft::core {
 
+template <typename index_t>
+_RAFT_HOST_DEVICE void inline compute_original_nbits_position(const index_t original_nbits,
+                                                              const index_t nbits,
+                                                              const index_t sample_index,
+                                                              index_t& new_bit_index,
+                                                              index_t& new_bit_offset)
+{
+  const index_t original_bit_index  = sample_index / original_nbits;
+  const index_t original_bit_offset = sample_index % original_nbits;
+  new_bit_index                     = original_bit_index * original_nbits / nbits;
+  new_bit_offset                    = 0;
+  if (original_nbits > nbits) {
+    new_bit_index += original_bit_offset / nbits;
+    new_bit_offset = original_bit_offset % nbits;
+  } else {
+    index_t ratio = nbits / original_nbits;
+    new_bit_offset += (original_bit_index % ratio) * original_nbits;
+    new_bit_offset += original_bit_offset % nbits;
+  }
+}
+
 template <typename bitset_t, typename index_t>
 _RAFT_HOST_DEVICE inline bool bitset_view<bitset_t, index_t>::test(const index_t sample_index) const
 {
-  const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
-  const index_t bit_index    = sample_index % bitset_element_size;
-  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bit_element = bitset_ptr_[bit_index];
+  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_offset)) != 0;
   return is_bit_set;
 }
 
@@ -51,14 +81,22 @@ template <typename bitset_t, typename index_t>
 _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
                                                       bool set_value) const
 {
-  const index_t bit_element = sample_index / bitset_element_size;
-  const index_t bit_index   = sample_index % bitset_element_size;
-  const bitset_t bitmask    = bitset_t{1} << bit_index;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bitmask = bitset_t{1} << bit_offset;
   if (set_value) {
-    atomicOr(bitset_ptr_ + bit_element, bitmask);
+    atomicOr(bitset_ptr_ + bit_index, bitmask);
   } else {
     const bitset_t bitmask2 = ~bitmask;
-    atomicAnd(bitset_ptr_ + bit_element, bitmask2);
+    atomicAnd(bitset_ptr_ + bit_index, bitmask2);
   }
 }
 
@@ -165,6 +203,13 @@ double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) cons
   return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
 }
 
+template <typename bitset_t, typename index_t>
+template <typename csr_matrix_t>
+void bitset_view<bitset_t, index_t>::to_csr(const raft::resources& res, csr_matrix_t& csr) const
+{
+  raft::sparse::convert::bitset_to_csr(res, *this, csr);
+}
+
 template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index be828def87..94113822fb 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -42,8 +42,20 @@ template <typename bitset_t = uint32_t, typename index_t = uint32_t>
 struct bitset_view {
   static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
 
-  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
-    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  /**
+   * @brief Create a bitset view from a device pointer to the bitset.
+   *
+   * @param bitset_ptr Device pointer to the bitset
+   * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Bit width of the element type the bitset was originally created with,
+   * or 0 if it matches `bitset_t`. Lets bits be addressed correctly when the data was written
+   * with a different element type than the ones currently supported in cuVS ANN indexes, so such
+   * a bitset can still be used with those indexes.
+   */
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr,
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}, original_nbits_{original_nbits}
   {
   }
   /**
@@ -51,10 +63,17 @@ struct bitset_view {
    *
    * @param bitset_span Device vector view of the bitset
    * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Bit width of the element type the bitset was originally created with,
+   * or 0 if it matches `bitset_t`. Lets bits be addressed correctly when the data was written
+   * with a different element type than the ones currently supported in cuVS ANN indexes, so such
+   * a bitset can still be used with those indexes.
    */
   _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
-                                index_t bitset_len)
-    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_span.data_handle()},
+      bitset_len_{bitset_len},
+      original_nbits_{original_nbits}
   {
   }
   /**
@@ -180,9 +199,79 @@ struct bitset_view {
     return (bitset_len + bits_per_element - 1) / bits_per_element;
   }
 
+  /**
+   * @brief Get or set the bit width of the element type the bitset was originally created with.
+   */
+  auto get_original_nbits() const -> index_t { return original_nbits_; }
+  void set_original_nbits(index_t original_nbits) { original_nbits_ = original_nbits; }
+
+  /**
+   * @brief Converts to a Compressed Sparse Row (CSR) format matrix.
+   *
+   * This method transforms the bitset view into a CSR matrix representation, where each '1' bit in
+   * the bitset corresponds to a non-zero entry in the CSR matrix. The bitset format supports
+   * only a single-row matrix, so if the CSR matrix requires multiple rows, the bitset data is
+   * repeated for each row in the output.
+   *
+   * Example usage:
+   *
+   * @code{.cpp}
+   * #include <raft/core/resource/cuda_stream.hpp>
+   * #include <raft/sparse/convert/csr.cuh>
+   * #include <rmm/device_uvector.hpp>
+   *
+   * using bitset_t = uint32_t;
+   * using index_t  = int;
+   * using value_t  = float;
+   *
+   * raft::resources handle;
+   * auto stream    = raft::resource::get_cuda_stream(handle);
+   * index_t n_rows = 3;
+   * index_t n_cols = 30;
+   *
+   * // Compute bitset size and initialize device memory
+   * index_t bitset_size = (n_cols + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);
+   * rmm::device_uvector<bitset_t> bitset_d(bitset_size, stream);
+   * std::vector<bitset_t> bitset_h = {
+   *   bitset_t(0b11001010),
+   * };  // Example bitset, with 4 non-zero entries.
+   *
+   * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+   *
+   * // Create bitset view and CSR matrix
+   * auto bitset_view = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n_cols);
+   * auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, n_rows, n_cols, 4 * n_rows);
+   *
+   * // Convert bitset to CSR
+   * bitset_view.to_csr(handle, csr);
+   * raft::resource::sync_stream(handle);
+   *
+   * // Results:
+   * // csr.indptr  = [0, 4, 8, 12];
+   * // csr.indices = [1, 3, 6, 7,
+   * //                1, 3, 6, 7,
+   * //                1, 3, 6, 7];
+   * // csr.values  = [1, 1, 1, 1,
+   * //                1, 1, 1, 1,
+   * //                1, 1, 1, 1];
+   * @endcode
+   *
+   * @tparam csr_matrix_t Specifies the CSR matrix type, constrained to raft::device_csr_matrix.
+   *
+   * @param[in] res RAFT resources for managing CUDA streams and execution policies.
+   * @param[out] csr Output parameter where the resulting CSR matrix is stored. Each '1' bit in
+   * the bitset corresponds to a non-zero element in the CSR matrix.
+   *
+   * The caller must ensure that the `csr` matrix is pre-allocated with dimensions and a non-zero
+   * count matching the expected output, i.e., `nnz_for_csr = nnz_for_bitset * n_rows`.
+   */
+  template <typename csr_matrix_t>
+  void to_csr(const raft::resources& res, csr_matrix_t& csr) const;
+
  private:
   bitset_t* bitset_ptr_;
   index_t bitset_len_;
+  index_t original_nbits_;
 };
 
 /**
diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp
index 52ac69f163..b812e28206 100644
--- a/cpp/include/raft/core/coo_matrix.hpp
+++ b/cpp/include/raft/core/coo_matrix.hpp
@@ -297,4 +297,4 @@ class coo_matrix
 
 /** @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/core/csr_matrix.hpp b/cpp/include/raft/core/csr_matrix.hpp
index 1113cc2023..4f7679bbae 100644
--- a/cpp/include/raft/core/csr_matrix.hpp
+++ b/cpp/include/raft/core/csr_matrix.hpp
@@ -309,4 +309,4 @@ class csr_matrix
 
 /** @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/core/cublas_macros.hpp b/cpp/include/raft/core/cublas_macros.hpp
index b69b121161..6c195d8a6f 100644
--- a/cpp/include/raft/core/cublas_macros.hpp
+++ b/cpp/include/raft/core/cublas_macros.hpp
@@ -23,9 +23,6 @@
 
 #include <cublas_v2.h>
 
-///@todo: enable this once we have logger enabled
-// #include <cuml/common/logger.hpp>
-
 #include <cstdint>
 
 #define _CUBLAS_ERR_TO_STR(err) \
diff --git a/cpp/include/raft/core/cusolver_macros.hpp b/cpp/include/raft/core/cusolver_macros.hpp
index 74a8b7c36c..beaf2d74dc 100644
--- a/cpp/include/raft/core/cusolver_macros.hpp
+++ b/cpp/include/raft/core/cusolver_macros.hpp
@@ -19,11 +19,10 @@
 
 #pragma once
 
+#include <raft/util/cudart_utils.hpp>
+
 #include <cusolverDn.h>
 #include <cusolverSp.h>
-///@todo: enable this once logging is enabled
-// #include <cuml/common/logger.hpp>
-#include <raft/util/cudart_utils.hpp>
 
 #include <type_traits>
 
@@ -135,4 +134,4 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
 #define CUSOLVER_CHECK_NO_THROW(call) CUSOLVER_TRY_NO_THROW(call)
 #endif
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/core/cusparse_macros.hpp b/cpp/include/raft/core/cusparse_macros.hpp
index 5a1968b529..2a1df14345 100644
--- a/cpp/include/raft/core/cusparse_macros.hpp
+++ b/cpp/include/raft/core/cusparse_macros.hpp
@@ -19,8 +19,6 @@
 #include <raft/core/error.hpp>
 
 #include <cusparse.h>
-///@todo: enable this once logging is enabled
-// #include <cuml/common/logger.hpp>
 
 #define _CUSPARSE_ERR_TO_STR(err) \
   case err: return #err;
diff --git a/cpp/include/raft/core/detail/callback_sink.hpp b/cpp/include/raft/core/detail/callback_sink.hpp
deleted file mode 100644
index a110af5c76..0000000000
--- a/cpp/include/raft/core/detail/callback_sink.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <iostream>
-#include <mutex>
-
-#define SPDLOG_HEADER_ONLY
-#include <spdlog/common.h>
-#include <spdlog/details/log_msg.h>
-#include <spdlog/sinks/base_sink.h>
-
-namespace spdlog::sinks {
-
-typedef void (*LogCallback)(int lvl, const char* msg);
-
-template <class Mutex>
-class CallbackSink : public base_sink<Mutex> {
- public:
-  explicit CallbackSink(std::string tag      = "spdlog",
-                        LogCallback callback = nullptr,
-                        void (*flush)()      = nullptr)
-    : _callback{callback}, _flush{flush} {};
-
-  void set_callback(LogCallback callback) { _callback = callback; }
-  void set_flush(void (*flush)()) { _flush = flush; }
-
- protected:
-  void sink_it_(const details::log_msg& msg) override
-  {
-    spdlog::memory_buf_t formatted;
-    base_sink<Mutex>::formatter_->format(msg, formatted);
-    std::string msg_string = fmt::to_string(formatted);
-
-    if (_callback) {
-      _callback(static_cast<int>(msg.level), msg_string.c_str());
-    } else {
-      std::cout << msg_string;
-    }
-  }
-
-  void flush_() override
-  {
-    if (_flush) {
-      _flush();
-    } else {
-      std::cout << std::flush;
-    }
-  }
-
-  LogCallback _callback;
-  void (*_flush)();
-};
-
-using callback_sink_mt = CallbackSink<std::mutex>;
-using callback_sink_st = CallbackSink<details::null_mutex>;
-
-}  // end namespace spdlog::sinks
diff --git a/cpp/include/raft/core/detail/fail_container_policy.hpp b/cpp/include/raft/core/detail/fail_container_policy.hpp
index cf9d0887dd..f5f1bfb377 100644
--- a/cpp/include/raft/core/detail/fail_container_policy.hpp
+++ b/cpp/include/raft/core/detail/fail_container_policy.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 #include <raft/core/error.hpp>
-#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/thirdparty/mdspan/include/experimental/mdspan>
 
diff --git a/cpp/include/raft/core/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp
deleted file mode 100644
index f3f52b46ae..0000000000
--- a/cpp/include/raft/core/detail/logger.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
-#pragma message(__FILE__                                                   \
-                  " is deprecated and will be removed in future releases." \
-                  " Please use the <raft/core/logger.hpp> version instead.")
-#endif
-
-#include <raft/core/logger.hpp>
diff --git a/cpp/include/raft/core/detail/mdspan_util.cuh b/cpp/include/raft/core/detail/mdspan_util.cuh
index ded95c2f31..d3438bc07d 100644
--- a/cpp/include/raft/core/detail/mdspan_util.cuh
+++ b/cpp/include/raft/core/detail/mdspan_util.cuh
@@ -67,4 +67,4 @@ MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
 #endif  // compiler
 }
 
-}  // end namespace raft::detail
\ No newline at end of file
+}  // end namespace raft::detail
diff --git a/cpp/include/raft/core/device_coo_matrix.hpp b/cpp/include/raft/core/device_coo_matrix.hpp
index 41da605ff0..4ed67d5fc5 100644
--- a/cpp/include/raft/core/device_coo_matrix.hpp
+++ b/cpp/include/raft/core/device_coo_matrix.hpp
@@ -395,4 +395,4 @@ auto make_device_coordinate_structure_view(raft::device_span<RowType> rows,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/device_csr_matrix.hpp b/cpp/include/raft/core/device_csr_matrix.hpp
index 1d23c8912d..b0dbfa000d 100644
--- a/cpp/include/raft/core/device_csr_matrix.hpp
+++ b/cpp/include/raft/core/device_csr_matrix.hpp
@@ -422,4 +422,4 @@ auto make_device_compressed_structure_view(raft::device_span<IndptrType> indptr,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/device_span.hpp b/cpp/include/raft/core/device_span.hpp
index d3350b5e3a..abf72b6b2e 100644
--- a/cpp/include/raft/core/device_span.hpp
+++ b/cpp/include/raft/core/device_span.hpp
@@ -34,4 +34,4 @@ using device_span = span<T, true, extent>;
 /**
  * @}
  */
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/host_coo_matrix.hpp b/cpp/include/raft/core/host_coo_matrix.hpp
index 7a216dc8a2..e0f95d2a77 100644
--- a/cpp/include/raft/core/host_coo_matrix.hpp
+++ b/cpp/include/raft/core/host_coo_matrix.hpp
@@ -393,4 +393,4 @@ auto make_host_coordinate_structure_view(raft::host_span<RowType> rows,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/host_csr_matrix.hpp b/cpp/include/raft/core/host_csr_matrix.hpp
index e3cea3cd27..8a29d957f6 100644
--- a/cpp/include/raft/core/host_csr_matrix.hpp
+++ b/cpp/include/raft/core/host_csr_matrix.hpp
@@ -423,4 +423,4 @@ auto make_host_compressed_structure_view(raft::host_span<IndptrType> indptr,
 
 /** @} */
 
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp
index 3020cde32d..229619999d 100644
--- a/cpp/include/raft/core/host_mdarray.hpp
+++ b/cpp/include/raft/core/host_mdarray.hpp
@@ -253,4 +253,4 @@ auto make_host_vector(IndexType n)
   return make_host_mdarray<ElementType, IndexType, LayoutPolicy>(make_extents<IndexType>(n));
 }
 
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/host_span.hpp b/cpp/include/raft/core/host_span.hpp
index 36978dfca4..d31f8b4c30 100644
--- a/cpp/include/raft/core/host_span.hpp
+++ b/cpp/include/raft/core/host_span.hpp
@@ -35,4 +35,4 @@ using host_span = span<T, false, extent>;
  * @}
  */
 
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/core/logger-ext.hpp b/cpp/include/raft/core/logger-ext.hpp
deleted file mode 100644
index 73fe463aba..0000000000
--- a/cpp/include/raft/core/logger-ext.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/core/detail/macros.hpp>  // RAFT_INLINE_CONDITIONAL
-
-#include <memory>         // std::unique_ptr
-#include <string>         // std::string
-#include <unordered_map>  // std::unordered_map
-
-namespace raft {
-
-static const std::string RAFT_NAME = "raft";
-static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v");
-
-namespace detail {
-RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...);
-}
-/**
- * @brief The main Logging class for raft library.
- *
- * This class acts as a thin wrapper over the underlying `spdlog` interface. The
- * design is done in this way in order to avoid us having to also ship `spdlog`
- * header files in our installation.
- *
- * @todo This currently only supports logging to stdout. Need to add support in
- *       future to add custom loggers as well [Issue #2046]
- */
-class logger {
- public:
-  // @todo setting the logger once per process with
-  logger(std::string const& name_ = "");
-  /**
-   * @brief Singleton method to get the underlying logger object
-   *
-   * @return the singleton logger object
-   */
-  static logger& get(std::string const& name = "");
-
-  /**
-   * @brief Set the logging level.
-   *
-   * Only messages with level equal or above this will be printed
-   *
-   * @param[in] level logging level
-   *
-   * @note The log level will actually be set only if the input is within the
-   *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
-   *       be ignored. See documentation of decisiontree for how this gets used
-   */
-  void set_level(int level);
-
-  /**
-   * @brief Set the logging pattern
-   *
-   * @param[in] pattern the pattern to be set. Refer this link
-   *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
-   *                    to know the right syntax of this pattern
-   */
-  void set_pattern(const std::string& pattern);
-
-  /**
-   * @brief Register a callback function to be run in place of usual log call
-   *
-   * @param[in] callback the function to be run on all logged messages
-   */
-  void set_callback(void (*callback)(int lvl, const char* msg));
-
-  /**
-   * @brief Register a flush function compatible with the registered callback
-   *
-   * @param[in] flush the function to use when flushing logs
-   */
-  void set_flush(void (*flush)());
-
-  /**
-   * @brief Tells whether messages will be logged for the given log level
-   *
-   * @param[in] level log level to be checked for
-   * @return true if messages will be logged for this level, else false
-   */
-  bool should_log_for(int level) const;
-  /**
-   * @brief Query for the current log level
-   *
-   * @return the current log level
-   */
-  int get_level() const;
-
-  /**
-   * @brief Get the current logging pattern
-   * @return the pattern
-   */
-  std::string get_pattern() const;
-
-  /**
-   * @brief Main logging method
-   *
-   * @param[in] level logging level of this message
-   * @param[in] fmt   C-like format string, followed by respective params
-   */
-  void log(int level, const char* fmt, ...);
-
-  /**
-   * @brief Flush logs by calling flush on underlying logger
-   */
-  void flush();
-
-  ~logger();
-
- private:
-  logger();
-  // pimpl pattern:
-  // https://learn.microsoft.com/en-us/cpp/cpp/pimpl-for-compile-time-encapsulation-modern-cpp?view=msvc-170
-  class impl;
-  std::unique_ptr<impl> pimpl;
-  static inline std::unordered_map<std::string, std::shared_ptr<raft::logger>> log_map;
-};  // class logger
-
-/**
- * @brief An object used for scoped log level setting
- *
- * Instances of `raft::log_level_setter` will set RAFT logging to the level
- * indicated on construction and will revert to the previous set level on
- * destruction.
- */
-struct log_level_setter {
-  explicit log_level_setter(int level)
-  {
-    prev_level_ = logger::get(RAFT_NAME).get_level();
-    logger::get(RAFT_NAME).set_level(level);
-  }
-  ~log_level_setter() { logger::get(RAFT_NAME).set_level(prev_level_); }
-
- private:
-  int prev_level_;
-};  // class log_level_setter
-
-};  // namespace raft
diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp
deleted file mode 100644
index ea5f4ea26e..0000000000
--- a/cpp/include/raft/core/logger-inl.hpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "logger-macros.hpp"
-
-#include <stdarg.h>
-
-#include <algorithm>
-#include <memory>
-#include <mutex>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-// The logger-ext.hpp file contains the class declaration of the logger class.
-// In this case, it is okay to include the logger-ext.hpp file because it
-// contains no RAFT_EXPLICIT template instantiations.
-#include "logger-ext.hpp"
-
-#define SPDLOG_HEADER_ONLY
-#include <raft/core/detail/callback_sink.hpp>
-#include <raft/core/detail/macros.hpp>  // RAFT_INLINE_CONDITIONAL
-
-#include <spdlog/sinks/stdout_color_sinks.h>  // NOLINT
-#include <spdlog/spdlog.h>                    // NOLINT
-
-namespace raft {
-
-namespace detail {
-
-inline std::string format(const char* fmt, va_list& vl)
-{
-  va_list vl_copy;
-  va_copy(vl_copy, vl);
-  int length = std::vsnprintf(nullptr, 0, fmt, vl_copy);
-  assert(length >= 0);
-  std::vector<char> buf(length + 1);
-  std::vsnprintf(buf.data(), length + 1, fmt, vl);
-  return std::string(buf.data());
-}
-
-RAFT_INLINE_CONDITIONAL std::string format(const char* fmt, ...)
-{
-  va_list vl;
-  va_start(vl, fmt);
-  std::string str = format(fmt, vl);
-  va_end(vl);
-  return str;
-}
-
-inline int convert_level_to_spdlog(int level)
-{
-  level = std::max(RAFT_LEVEL_OFF, std::min(RAFT_LEVEL_TRACE, level));
-  return RAFT_LEVEL_TRACE - level;
-}
-
-}  // namespace detail
-
-class logger::impl {  // defined privately here
-                      // ... all private data and functions: all of these
-                      //     can now change without recompiling callers ...
- public:
-  std::shared_ptr<spdlog::sinks::callback_sink_mt> sink;
-  std::shared_ptr<spdlog::logger> spdlogger;
-  std::string cur_pattern;
-  int cur_level;
-
-  impl(std::string const& name_ = "")
-    : sink{std::make_shared<spdlog::sinks::callback_sink_mt>()},
-      spdlogger{std::make_shared<spdlog::logger>(name_, sink)},
-      cur_pattern()
-  {
-  }
-};  // class logger::impl
-
-RAFT_INLINE_CONDITIONAL logger::logger(std::string const& name_) : pimpl(new impl(name_))
-{
-  set_pattern(default_log_pattern);
-  set_level(RAFT_ACTIVE_LEVEL);
-}
-
-RAFT_INLINE_CONDITIONAL logger& logger::get(std::string const& name)
-{
-  if (log_map.find(name) == log_map.end()) { log_map[name] = std::make_shared<raft::logger>(name); }
-  return *log_map[name];
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_level(int level)
-{
-  level = raft::detail::convert_level_to_spdlog(level);
-  pimpl->spdlogger->set_level(static_cast<spdlog::level::level_enum>(level));
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_pattern(const std::string& pattern)
-{
-  pimpl->cur_pattern = pattern;
-  pimpl->spdlogger->set_pattern(pattern);
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_callback(void (*callback)(int lvl, const char* msg))
-{
-  pimpl->sink->set_callback(callback);
-}
-
-RAFT_INLINE_CONDITIONAL void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
-
-RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const
-{
-  level        = raft::detail::convert_level_to_spdlog(level);
-  auto level_e = static_cast<spdlog::level::level_enum>(level);
-  return pimpl->spdlogger->should_log(level_e);
-}
-
-RAFT_INLINE_CONDITIONAL int logger::get_level() const
-{
-  auto level_e = pimpl->spdlogger->level();
-  return RAFT_LEVEL_TRACE - static_cast<int>(level_e);
-}
-
-RAFT_INLINE_CONDITIONAL std::string logger::get_pattern() const { return pimpl->cur_pattern; }
-
-RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...)
-{
-  level        = raft::detail::convert_level_to_spdlog(level);
-  auto level_e = static_cast<spdlog::level::level_enum>(level);
-  // explicit check to make sure that we only expand messages when required
-  if (pimpl->spdlogger->should_log(level_e)) {
-    va_list vl;
-    va_start(vl, fmt);
-    auto msg = raft::detail::format(fmt, vl);
-    va_end(vl);
-    pimpl->spdlogger->log(level_e, msg);
-  }
-}
-
-RAFT_INLINE_CONDITIONAL void logger::flush() { pimpl->spdlogger->flush(); }
-
-RAFT_INLINE_CONDITIONAL logger::~logger() {}
-
-};  // namespace raft
diff --git a/cpp/include/raft/core/logger-macros.hpp b/cpp/include/raft/core/logger-macros.hpp
index 5ddb072067..e32440dcce 100644
--- a/cpp/include/raft/core/logger-macros.hpp
+++ b/cpp/include/raft/core/logger-macros.hpp
@@ -15,92 +15,17 @@
  */
 #pragma once
 
-/**
- * @defgroup logging levels used in raft
- *
- * @note exactly match the corresponding ones (but reverse in terms of value)
- *       in spdlog for wrapping purposes
- *
- * @{
- */
-#define RAFT_LEVEL_TRACE    6
-#define RAFT_LEVEL_DEBUG    5
-#define RAFT_LEVEL_INFO     4
-#define RAFT_LEVEL_WARN     3
-#define RAFT_LEVEL_ERROR    2
-#define RAFT_LEVEL_CRITICAL 1
-#define RAFT_LEVEL_OFF      0
-/** @} */
-
-#if !defined(RAFT_ACTIVE_LEVEL)
-#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO
-#endif
-
-/**
- * @defgroup loggerMacros Helper macros for dealing with logging
- * @{
- */
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_TRACE(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE_VEC(ptr, len)                                      \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    print_vector(#ptr, ptr, len, ss);                                     \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
+#include <sstream>
+
+#if (RAFT_LOG_ACTIVE_LEVEL <= RAFT_LOG_LEVEL_TRACE)  // otherwise the macro expands to void(0)
+#define RAFT_LOG_TRACE_VEC(ptr, len)                                               \
+  do {                                                                             \
+    std::stringstream ss;                                                          \
+    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);                      \
+    print_vector(#ptr, ptr, len, ss);                                              \
+    RAFT_LOGGER_CALL(                                                              \
+      raft::default_logger(), raft::level_enum::trace, "%s", ss.str().c_str());    \
   } while (0)
 #else
 #define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
 #endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
-#define RAFT_LOG_DEBUG(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_DEBUG(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO)
-#define RAFT_LOG_INFO(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_INFO(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN)
-#define RAFT_LOG_WARN(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_WARN(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR)
-#define RAFT_LOG_ERROR(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_ERROR(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL)
-#define RAFT_LOG_CRITICAL(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_CRITICAL(fmt, ...) void(0)
-#endif
-/** @} */
diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
deleted file mode 100644
index e64a0db257..0000000000
--- a/cpp/include/raft/core/logger.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "logger-ext.hpp"
-#include "logger-macros.hpp"
-
-#if !defined(RAFT_COMPILED)
-#include "logger-inl.hpp"
-#endif
diff --git a/cpp/include/raft/core/resource/device_id.hpp b/cpp/include/raft/core/resource/device_id.hpp
index 570d815780..a371f9ddde 100644
--- a/cpp/include/raft/core/resource/device_id.hpp
+++ b/cpp/include/raft/core/resource/device_id.hpp
@@ -73,4 +73,4 @@ inline int get_device_id(resources const& res)
 /**
  * @}
  */
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/device_properties.hpp b/cpp/include/raft/core/resource/device_properties.hpp
index a87c29f709..7ac780ef16 100644
--- a/cpp/include/raft/core/resource/device_properties.hpp
+++ b/cpp/include/raft/core/resource/device_properties.hpp
@@ -75,4 +75,4 @@ inline cudaDeviceProp& get_device_properties(resources const& res)
 /**
  * @}
  */
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/sub_comms.hpp b/cpp/include/raft/core/resource/sub_comms.hpp
index 11d2aed1e0..b4fef75d57 100644
--- a/cpp/include/raft/core/resource/sub_comms.hpp
+++ b/cpp/include/raft/core/resource/sub_comms.hpp
@@ -79,4 +79,4 @@ inline void set_subcomm(resources const& res,
  * @}
  */
 
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp
index 55da3037a9..6e5092f50f 100644
--- a/cpp/include/raft/core/sparse_types.hpp
+++ b/cpp/include/raft/core/sparse_types.hpp
@@ -222,4 +222,4 @@ class sparse_matrix {
 
 /* @} */
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h b/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
index 42de4860a0..56cce4de8b 100644
--- a/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
+++ b/cpp/include/raft/distance/detail/fused_distance_nn/gemm.h
@@ -406,4 +406,4 @@ struct FusedDistanceNNGemm<double,
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
\ No newline at end of file
+}  // namespace cutlass
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
index aaf2689dab..cc85a918a3 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
+++ b/cpp/include/raft/distance/detail/pairwise_distance_gemm.h
@@ -235,4 +235,4 @@ struct PairwiseDistanceGemm<double,
 
 }  // namespace kernel
 }  // namespace gemm
-}  // namespace cutlass
\ No newline at end of file
+}  // namespace cutlass
diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh
index 13c9d57efd..d5f8d1cfe1 100644
--- a/cpp/include/raft/distance/distance-inl.cuh
+++ b/cpp/include/raft/distance/distance-inl.cuh
@@ -366,7 +366,7 @@ void pairwise_distance(raft::resources const& handle,
  * #include <raft/random/make_blobs.cuh>
  * #include <raft/distance/distance.cuh>
  *
- * raft::raft::resources handle;
+ * raft::resources handle;
  * int n_samples = 5000;
  * int n_features = 50;
  *
diff --git a/cpp/include/raft/distance/fused_distance_nn.cuh b/cpp/include/raft/distance/fused_distance_nn.cuh
index 25b1ae01ea..aa20bfeaf1 100755
--- a/cpp/include/raft/distance/fused_distance_nn.cuh
+++ b/cpp/include/raft/distance/fused_distance_nn.cuh
@@ -15,4 +15,4 @@
  */
 #pragma once
 
-#include "fused_distance_nn-inl.cuh"
\ No newline at end of file
+#include "fused_distance_nn-inl.cuh"
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 93c1080ff2..c539419738 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -118,4 +118,4 @@ void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zer
 };  // namespace label
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh
index 5513f16b9f..891bc9313a 100644
--- a/cpp/include/raft/label/detail/merge_labels.cuh
+++ b/cpp/include/raft/label/detail/merge_labels.cuh
@@ -155,4 +155,4 @@ void merge_labels(value_idx* labels_a,
 
 }  // namespace detail
 };  // namespace label
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index 2bf2fa830b..370b6b8996 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -68,4 +68,4 @@ void merge_labels(value_idx* labels_a,
 };  // namespace label
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index 292140b4dc..e938626b20 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -139,4 +139,4 @@ void choleskyRank1Update(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index a4247e618f..b377bad101 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -163,4 +163,4 @@ void coalesced_reduction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 121ac10e24..0227fea4a4 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -60,4 +60,4 @@ void addDevScalar(
 
 }  // namespace detail
 }  // namespace linalg
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index d23c7d60a6..2b9a7ba485 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -99,4 +99,4 @@ void divide_scalar(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 9f03f54f9a..7245d31191 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -223,4 +223,4 @@ void eig_jacobi(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 2e6c1a4ab5..569845d488 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -97,4 +97,4 @@ void eltwiseDivideCheckZero(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh
index 31bad62930..6b33561f48 100644
--- a/cpp/include/raft/linalg/gemv.cuh
+++ b/cpp/include/raft/linalg/gemv.cuh
@@ -307,4 +307,4 @@ void gemv(raft::resources const& handle,
 
 };  // namespace linalg
 };  // namespace raft
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/linalg_types.hpp b/cpp/include/raft/linalg/linalg_types.hpp
index 9c81fbc177..aa3e402988 100644
--- a/cpp/include/raft/linalg/linalg_types.hpp
+++ b/cpp/include/raft/linalg/linalg_types.hpp
@@ -39,4 +39,4 @@ enum class FillMode { UPPER, LOWER };
  */
 enum class Operation { NON_TRANSPOSE, TRANSPOSE };
 
-}  // end namespace raft::linalg
\ No newline at end of file
+}  // end namespace raft::linalg
diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh
index 21575d7806..5188e69268 100644
--- a/cpp/include/raft/linalg/lstsq.cuh
+++ b/cpp/include/raft/linalg/lstsq.cuh
@@ -248,4 +248,4 @@ void lstsq_qr(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh
index 1886c941b9..505aade1cf 100644
--- a/cpp/include/raft/linalg/map_reduce.cuh
+++ b/cpp/include/raft/linalg/map_reduce.cuh
@@ -115,4 +115,4 @@ void map_reduce(raft::resources const& handle,
 
 }  // end namespace raft::linalg
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index a69ac6df36..f4ab356f1c 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -91,4 +91,4 @@ template <typename InType,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh
index 85805c287a..ab8a3dbd04 100644
--- a/cpp/include/raft/linalg/matrix_vector.cuh
+++ b/cpp/include/raft/linalg/matrix_vector.cuh
@@ -200,4 +200,4 @@ void binary_sub(raft::resources const& handle,
 
 /** @} */  // end of matrix_vector
 
-}  // namespace raft::linalg
\ No newline at end of file
+}  // namespace raft::linalg
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 1a7668f8f2..f01af3b700 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -101,4 +101,4 @@ void multiply_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh
index 5c7dcbd5cf..5f319a7537 100644
--- a/cpp/include/raft/linalg/power.cuh
+++ b/cpp/include/raft/linalg/power.cuh
@@ -157,4 +157,4 @@ void power_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index b032cbfa3a..ce07baea1f 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -123,4 +123,4 @@ void qr_get_qr(raft::resources const& handle,
 };  // namespace linalg
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index 8fd6e45d37..7f9ec0c197 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -170,4 +170,4 @@ void reduce(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 77ad8a9a80..e0f6fe257c 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -116,4 +116,4 @@ void reduce_cols_by_key(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 2bb14729f4..edb325acc1 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -195,4 +195,4 @@ void reduce_rows_by_key(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh
index a90ba165ed..404e8e2dd9 100644
--- a/cpp/include/raft/linalg/rsvd.cuh
+++ b/cpp/include/raft/linalg/rsvd.cuh
@@ -880,4 +880,4 @@ void randomized_svd(const raft::resources& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh
index 81b7ab7dec..2dfa1ccd91 100644
--- a/cpp/include/raft/linalg/sqrt.cuh
+++ b/cpp/include/raft/linalg/sqrt.cuh
@@ -87,4 +87,4 @@ void sqrt(raft::resources const& handle, InType in, OutType out)
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index ac97c3cd68..c283d5721a 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -174,4 +174,4 @@ void strided_reduction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index e7b5c6b65a..dc59d955ad 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -226,4 +226,4 @@ void subtract_scalar(
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 1c57515a47..11f700c2a9 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -420,4 +420,4 @@ void svd_reconstruction(raft::resources const& handle,
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh
index c94b2506d3..9e3b806702 100644
--- a/cpp/include/raft/matrix/col_wise_sort.cuh
+++ b/cpp/include/raft/matrix/col_wise_sort.cuh
@@ -136,4 +136,4 @@ void sort_cols_per_row(Args... args)
 
 };  // end namespace raft::matrix
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh
index a37ba550f9..6fa6ce4aee 100644
--- a/cpp/include/raft/matrix/detail/gather_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh
@@ -114,4 +114,4 @@ void gather(raft::resources const& handle,
 
 }  // namespace detail
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
index 6a60e5006b..c00d212c10 100644
--- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh
+++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh
@@ -126,4 +126,4 @@ void scatter(raft::resources const& handle,
 
 }  // end namespace detail
 }  // end namespace matrix
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp
index 6ed9a0d358..10a9f66ae3 100644
--- a/cpp/include/raft/matrix/math.hpp
+++ b/cpp/include/raft/matrix/math.hpp
@@ -20,4 +20,4 @@
 
 #pragma once
 
-#include "math.cuh"
\ No newline at end of file
+#include "math.cuh"
diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh
index ecfdb19191..8397f94a8d 100644
--- a/cpp/include/raft/matrix/norm.cuh
+++ b/cpp/include/raft/matrix/norm.cuh
@@ -41,4 +41,4 @@ m_t l2_norm(raft::resources const& handle, raft::device_mdspan<const m_t, idx_t>
 
 /** @} */  // end of group matrix_norm
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh
index 42057bb0f5..c10fa8f5f0 100644
--- a/cpp/include/raft/matrix/reverse.cuh
+++ b/cpp/include/raft/matrix/reverse.cuh
@@ -69,4 +69,4 @@ void row_reverse(raft::resources const& handle,
 }
 /** @} */  // end group matrix_reverse
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/scatter.cuh b/cpp/include/raft/matrix/scatter.cuh
index cd2d76a863..072f0c18ac 100644
--- a/cpp/include/raft/matrix/scatter.cuh
+++ b/cpp/include/raft/matrix/scatter.cuh
@@ -55,4 +55,4 @@ void scatter(raft::resources const& handle,
   detail::scatter(handle, inout, map, col_batch_size);
 }
 
-}  // namespace raft::matrix
\ No newline at end of file
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
index c922a0d7f4..caff6ea341 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
@@ -228,4 +228,4 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
   }
 };
 
-}  // namespace raft::neighbors::cagra::detail
\ No newline at end of file
+}  // namespace raft::neighbors::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/div_utils.hpp b/cpp/include/raft/neighbors/detail/div_utils.hpp
index 0455d0ec9b..4dd7b66d46 100644
--- a/cpp/include/raft/neighbors/detail/div_utils.hpp
+++ b/cpp/include/raft/neighbors/detail/div_utils.hpp
@@ -63,4 +63,4 @@ struct div_utils {
 #endif
   }
 };
-}  // namespace raft::neighbors::detail
\ No newline at end of file
+}  // namespace raft::neighbors::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index 55184cc615..0e00ef571f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/cluster/kmeans_balanced.cuh>
+#include <raft/core/logger-macros.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/nvtx.hpp>
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 388dd60f14..44d55c36de 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -16,7 +16,8 @@
 
 #pragma once
 
-#include <raft/core/logger.hpp>  // RAFT_LOG_TRACE
+#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>                              // raft::resources
 #include <raft/distance/distance_types.hpp>                     // is_min_close, DistanceType
diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh
index 02610f9afb..64e4a3ea7a 100644
--- a/cpp/include/raft/neighbors/detail/nn_descent.cuh
+++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -709,7 +709,8 @@ template <typename Index_t,
           typename epilogue_op = DistEpilogue<Index_t, DistData_t>>
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890)
+#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) || \
+  (__CUDA_ARCH__) == 1200
 __launch_bounds__(BLOCK_SIZE)
 #else
 __launch_bounds__(BLOCK_SIZE, 4)
diff --git a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
index bade4385fb..c2f531263d 100644
--- a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh
@@ -76,7 +76,7 @@ void epsUnexpL2SqNeighborhood(bool* adj,
  *  #include <raft/core/resources.hpp>
  *  #include <raft/core/device_mdarray.hpp>
  *  using namespace raft::neighbors;
- *  raft::raft::resources handle;
+ *  raft::resources handle;
  *  ...
  *  auto adj = raft::make_device_matrix<bool>(handle, m * n);
  *  auto vd = raft::make_device_vector<int>(handle, m+1);
@@ -120,4 +120,4 @@ void eps_neighbors_l2sq(raft::resources const& handle,
 
 }  // namespace raft::neighbors::epsilon_neighborhood
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
index 5379788ab4..db03d78105 100644
--- a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp
@@ -87,4 +87,4 @@ _RAFT_HOST_DEVICE void unpack_1(
     }
   }
 }
-}  // namespace raft::neighbors::ivf_flat::codepacker
\ No newline at end of file
+}  // namespace raft::neighbors::ivf_flat::codepacker
diff --git a/cpp/include/raft/random/detail/curand_wrappers.hpp b/cpp/include/raft/random/detail/curand_wrappers.hpp
index 969d739cc1..d62e64d532 100644
--- a/cpp/include/raft/random/detail/curand_wrappers.hpp
+++ b/cpp/include/raft/random/detail/curand_wrappers.hpp
@@ -54,4 +54,4 @@ inline curandStatus_t curandGenerateNormal(
 /** @} */
 
 };  // end namespace detail
-};  // end namespace raft::random
\ No newline at end of file
+};  // end namespace raft::random
diff --git a/cpp/include/raft/random/detail/permute.cuh b/cpp/include/raft/random/detail/permute.cuh
index 37caa51ad3..b1c56afa0c 100644
--- a/cpp/include/raft/random/detail/permute.cuh
+++ b/cpp/include/raft/random/detail/permute.cuh
@@ -161,4 +161,4 @@ void permute(IntType* perms,
 }
 
 };  // end namespace detail
-};  // end namespace raft::random
\ No newline at end of file
+};  // end namespace raft::random
diff --git a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
index 9ad7c68f87..12c01fc5d7 100644
--- a/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/detail/rmat_rectangular_generator.cuh
@@ -54,8 +54,8 @@ DI void gen_and_update_bits(IdxT& src_id,
   } else {
     src_bit = dst_bit = true;
   }
-  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (r_scale - curr_depth - 1)); }
-  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (c_scale - curr_depth - 1)); }
+  if (curr_depth < r_scale) { src_id |= (IdxT(src_bit) << (curr_depth)); }
+  if (curr_depth < c_scale) { dst_id |= (IdxT(dst_bit) << (curr_depth)); }
 }
 
 template <typename IdxT>
@@ -151,15 +151,16 @@ RAFT_KERNEL rmat_gen_kernel(IdxT* out,
   raft::random::PCGenerator gen{r.seed, r.base_subsequence + idx, 0};
   auto min_scale = min(r_scale, c_scale);
   IdxT i         = 0;
-  for (; i < min_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a, a + b, a + b + c, r_scale, c_scale, i, gen);
-  }
-  for (; i < r_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a + b, a + b, ProbT(1), r_scale, c_scale, i, gen);
-  }
-  for (; i < c_scale; ++i) {
-    gen_and_update_bits(src_id, dst_id, a + c, ProbT(1), ProbT(1), r_scale, c_scale, i, gen);
+  // Whether we have more rows than columns.
+  const bool more_rows = r_scale > c_scale;
+
+  for (; i < max_scale; ++i) {
+    ProbT A   = (i < min_scale) ? a : (more_rows ? a + b : a + c);
+    ProbT AB  = (i < min_scale) ? a + b : (more_rows ? a + b : ProbT(1));
+    ProbT ABC = (i < min_scale) ? a + b + c : ProbT(1);
+    gen_and_update_bits(src_id, dst_id, A, AB, ABC, r_scale, c_scale, i, gen);
   }
+
   store_ids(out, out_src, out_dst, src_id, dst_id, idx, n_edges);
 }
 
diff --git a/cpp/include/raft/random/device/sample.cuh b/cpp/include/raft/random/device/sample.cuh
index d0e5200185..67b98f12fe 100644
--- a/cpp/include/raft/random/device/sample.cuh
+++ b/cpp/include/raft/random/device/sample.cuh
@@ -27,12 +27,14 @@ namespace raft::random::device {
 
 /**
  * @brief warp-level random sampling of an index.
+ *
  * It selects an index with the given discrete probability
- * distribution(represented by weights of each index)
+ * distribution (represented by the weights of each index).
+ * Only thread 0 will contain the valid reduced result.
+ *
  * @param rng random number generator, must have next_u32() function
  * @param weight weight of the rank/index.
  * @param idx index to be used as rank
- * @return only the thread0 will contain valid reduced result
  */
 template <typename T, typename rng_t, typename i_t = int>
 DI void warp_random_sample(rng_t& rng, T& weight, i_t& idx)
diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh
index 4fd1f44f64..296b7ab283 100644
--- a/cpp/include/raft/random/make_blobs.cuh
+++ b/cpp/include/raft/random/make_blobs.cuh
@@ -187,4 +187,4 @@ void make_blobs(
 
 }  // end namespace raft::random
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/random/rmat_rectangular_generator.cuh b/cpp/include/raft/random/rmat_rectangular_generator.cuh
index 5598b25c8e..cdd89f40dd 100644
--- a/cpp/include/raft/random/rmat_rectangular_generator.cuh
+++ b/cpp/include/raft/random/rmat_rectangular_generator.cuh
@@ -30,8 +30,18 @@ namespace raft::random {
 /**
  * @brief Generate a bipartite RMAT graph for a rectangular adjacency matrix.
  *
- * This is the most general of several overloads of `rmat_rectangular_gen`
- * in this file, and thus has the most detailed documentation.
+ * This function generates a random graph represented by a (sparse) adjacency matrix. As described
+ * in [1], to generate connections, we recursively subdivide the adjacency matrix into four
+ * equal-sized partitions, and distribute edges within these partitions with unequal
+ * probabilities. The probabilities are described by numbers [a, b, c, d]. We choose the upper left
+ * partition with probability `a`. The chosen partition is again subdivided into four smaller
+ * partitions, and the procedure is repeated until we reach a single element (1 x 1 partition).
+ *
+ * We can prescribe a different probability distribution at each iteration. The `theta` array
+ * stores the probability values for each level.
+ *
+ * [1] "R-MAT: A Recursive Model for Graph Mining" Deepayan Chakrabarti, Yiping Zhan, Christos
+ * Faloutsos (2004) https://doi.org/10.1137/1.9781611972740.43
  *
  * @tparam IdxT  Type of each node index
  * @tparam ProbT Data type used for probability distributions (either fp32 or fp64)
@@ -49,11 +59,14 @@ namespace raft::random {
  * @param[out] out_dst Destination node id's [on device].  `out_src` and `out_dst`
  *                     together form the struct-of-arrays representation of the same
  *                     output data as `out`.
- * @param[in]  theta   distribution of each quadrant at each level of resolution.
- *                     Since these are probabilities, each of the 2x2 matrices for
- *                     each level of the RMAT must sum to one. [on device]
- *                     [dim = max(r_scale, c_scale) x 2 x 2]. Of course, it is assumed
- *                     that each of the group of 2 x 2 numbers all sum up to 1.
+ * @param[in]  theta   array [on device] with the distribution of each quadrant at each level of
+ *                     resolution. theta = [a0, b0, c0, d0, a1, b1, c1, d1, ...], where
+ *                     [a0, b0, c0, d0]  defines the probability at the finest level (2x2).
+ *                     The last four elements in the array describe the probability in the
+ *                     coarsest level (where matrix size = [2^r_scale, 2^c_scale]).
+ *                     Since these are probabilities, the four [a_i, b_i, c_i, d_i] values for
+ *                     each level of the RMAT must sum to one.
+ *                     [dim = max(r_scale, c_scale) x 2 x 2].
  * @param[in]  r_scale 2^r_scale represents the number of source nodes
  * @param[in]  c_scale 2^c_scale represents the number of destination nodes
  *
diff --git a/cpp/include/raft/random/sample_without_replacement.cuh b/cpp/include/raft/random/sample_without_replacement.cuh
index fad1d4adfa..6e3d63ab9f 100644
--- a/cpp/include/raft/random/sample_without_replacement.cuh
+++ b/cpp/include/raft/random/sample_without_replacement.cuh
@@ -166,4 +166,4 @@ void sample_without_replacement(Args... args)
 
 /** @} */
 
-}  // end namespace raft::random
\ No newline at end of file
+}  // end namespace raft::random
diff --git a/cpp/include/raft/solver/detail/lap_kernels.cuh b/cpp/include/raft/solver/detail/lap_kernels.cuh
index 383c3ab713..3c25852240 100644
--- a/cpp/include/raft/solver/detail/lap_kernels.cuh
+++ b/cpp/include/raft/solver/detail/lap_kernels.cuh
@@ -26,6 +26,7 @@
 
 #include "../linear_assignment_types.hpp"
 
+#include <raft/core/detail/macros.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/util/cudart_utils.hpp>
 
@@ -552,4 +553,4 @@ RAFT_KERNEL kernel_calcObjValPrimal(weight_t* d_obj_val_primal,
   }
 }
 
-}  // namespace raft::solver::detail
\ No newline at end of file
+}  // namespace raft::solver::detail
diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh
index 7ee0f5fbc3..2357c56422 100644
--- a/cpp/include/raft/solver/linear_assignment.cuh
+++ b/cpp/include/raft/solver/linear_assignment.cuh
@@ -331,4 +331,4 @@ class LinearAssignmentProblem {
 
 }  // namespace raft::solver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index b5568ef7d9..ba3efc7ff0 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -43,4 +43,4 @@ void csr_to_coo(
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 081192ed44..73d099a719 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -18,10 +18,12 @@
 
 #pragma once
 
-#include <raft/core/bitmap.cuh>
+#include <raft/core/bitmap.hpp>
+#include <raft/core/bitset.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
 #include <raft/sparse/convert/detail/bitmap_to_csr.cuh>
+#include <raft/sparse/convert/detail/bitset_to_csr.cuh>
 #include <raft/sparse/convert/detail/csr.cuh>
 #include <raft/sparse/csr.hpp>
 
@@ -129,6 +131,80 @@ void bitmap_to_csr(raft::resources const& handle,
   detail::bitmap_to_csr(handle, bitmap, csr);
 }
 
+/**
+ * @brief  Converts a bitset matrix to a Compressed Sparse Row (CSR) format matrix.
+ *
+ * The bitset format inherently supports only a single-row matrix (rows=1). If the CSR matrix
+ * requires multiple rows, the data from the bitset will be repeated for each row in the output.
+ *
+ * Example usage:
+ *
+ * @code{.cpp}
+ * #include <raft/core/resource/cuda_stream.hpp>
+ * #include <raft/sparse/convert/csr.cuh>
+ * #include <rmm/device_uvector.hpp>
+ *
+ * #include <vector>
+ *
+ * using bitset_t = uint32_t;
+ * using index_t  = int;
+ * using value_t  = float;
+ * using nnz_t    = index_t;
+ *
+ * raft::resources handle;
+ * auto stream    = resource::get_cuda_stream(handle);
+ * index_t n_rows = 3;
+ * index_t n_cols = 30;
+ *
+ * nnz_t nnz_for_bitset = 4;
+ * nnz_t nnz_for_csr    = nnz_for_bitset * n_rows;
+ *
+ * index_t bitset_size = (n_cols + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);  //  = 1
+ *
+ * rmm::device_uvector<bitset_t> bitset_d(bitset_size, stream);
+ * std::vector<bitset_t> bitset_h = {
+ *   bitset_t(0b11001010),
+ * };  // nnz_for_bitset = 4;
+ *
+ * raft::copy(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+ *
+ * auto bitset_view = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), n_cols);
+ * auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, n_rows, n_cols, nnz_for_csr);
+ *
+ * raft::sparse::convert::bitset_to_csr(handle, bitset_view, csr);
+ * resource::sync_stream(handle);
+ *
+ * // Results:
+ * // csr.indptr  = [0, 4, 8, 12];
+ * // csr.indices = [1, 3, 6, 7,
+ * //                1, 3, 6, 7,
+ * //                1, 3, 6, 7];
+ * // csr.values  = [1, 1, 1, 1,
+ * //                1, 1, 1, 1,
+ * //                1, 1, 1, 1];
+ * @endcode
+ *
+ * @tparam       bitset_t       The data type of the elements in the bitset matrix.
+ * @tparam       index_t        The data type used for indexing the elements in the matrices.
+ * @tparam       csr_matrix_t   Specifies the CSR matrix type, constrained to
+ * raft::device_csr_matrix.
+ *
+ * @param[in]    handle         The RAFT handle containing the CUDA stream for operations.
+ * @param[in]    bitset         The bitset matrix view, to be converted to CSR format.
+ * @param[out]   csr            Output parameter where the resulting CSR matrix is stored. In the
+ * bitset, each '1' bit corresponds to a non-zero element in the CSR matrix.
+ */
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  detail::bitset_to_csr(handle, bitset, csr);
+}
+
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
index a146113a86..6613049f25 100644
--- a/cpp/include/raft/sparse/convert/dense.cuh
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -64,4 +64,4 @@ void csr_to_dense(cusparseHandle_t handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
index 769d5de9be..be62f76502 100644
--- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
@@ -21,6 +21,7 @@
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/sparse/convert/detail/adj_to_csr.cuh>
+#include <raft/util/device_loads_stores.cuh>
 
 #include <rmm/device_uvector.hpp>
 
@@ -41,61 +42,68 @@ namespace sparse {
 namespace convert {
 namespace detail {
 
-// Threads per block in calc_nnz_by_rows_kernel.
-static const constexpr int calc_nnz_by_rows_tpb = 32;
+// Threads per block in bitmap_to_csr.
+static const constexpr int bitmap_to_csr_tpb = 256;
 
 template <typename bitmap_t, typename index_t, typename nnz_t>
-RAFT_KERNEL __launch_bounds__(calc_nnz_by_rows_tpb) calc_nnz_by_rows_kernel(const bitmap_t* bitmap,
-                                                                            index_t num_rows,
-                                                                            index_t num_cols,
-                                                                            index_t bitmap_num,
-                                                                            nnz_t* nnz_per_row)
+RAFT_KERNEL __launch_bounds__(bitmap_to_csr_tpb) calc_nnz_by_rows_kernel(const bitmap_t* bitmap,
+                                                                         index_t num_rows,
+                                                                         index_t num_cols,
+                                                                         index_t bitmap_num,
+                                                                         nnz_t* sub_col_nnz,
+                                                                         index_t bits_per_sub_col)
 {
-  constexpr bitmap_t FULL_MASK      = ~bitmap_t(0u);
-  constexpr bitmap_t ONE            = bitmap_t(1u);
+  using mutable_bitmap_t = typename std::remove_const_t<bitmap_t>;
+  using BlockReduce      = cub::BlockReduce<index_t, bitmap_to_csr_tpb>;
+
+  __shared__ typename BlockReduce::TempStorage reduce_storage;
+
   constexpr index_t BITS_PER_BITMAP = sizeof(bitmap_t) * 8;
 
-  auto block = cg::this_thread_block();
-  auto tile  = cg::tiled_partition<32>(block);
+  const auto tid = threadIdx.x;
+  const auto row = blockIdx.x;
 
-  int lane_id = threadIdx.x & 0x1f;
+  const auto num_sub_cols = gridDim.y;
+  const auto sub_col      = blockIdx.y;
 
-  for (index_t row = blockIdx.x; row < num_rows; row += gridDim.x) {
-    index_t offset = 0;
-    index_t s_bit  = row * num_cols;
-    index_t e_bit  = s_bit + num_cols;
-    index_t l_sum  = 0;
+  size_t s_bit = size_t(row) * num_cols + sub_col * bits_per_sub_col;
+  size_t e_bit = min(s_bit + bits_per_sub_col, size_t(num_cols) * (row + 1));
 
-    int s_gap = 0;
-    int e_gap = 0;
+  nnz_t l_sum = 0;
+  nnz_t g_sum = 0;
 
-    while (offset < num_cols) {
-      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      std::remove_const_t<bitmap_t> l_bitmap = 0;
+  index_t s_offset  = s_bit % BITS_PER_BITMAP;
+  size_t bitmap_idx = s_bit / BITS_PER_BITMAP;
 
-      if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+  if (tid == 0 && s_offset != 0) {
+    mutable_bitmap_t l_bitmap = bitmap[bitmap_idx];
 
-      offset += BITS_PER_BITMAP * warpSize;
+    l_bitmap >>= s_offset;
 
-      s_gap = s_bit - bitmap_idx * BITS_PER_BITMAP;
-      if (s_gap > 0) {
-        l_bitmap >>= s_gap;
-        l_bitmap <<= s_gap;
-        offset -= s_gap;
-      }
+    size_t remaining_bits = min(size_t(BITS_PER_BITMAP - s_offset), e_bit - s_bit);
 
-      e_gap = (bitmap_idx + 1) * BITS_PER_BITMAP - e_bit;
-      if (e_gap > 0) {
-        l_bitmap <<= e_gap;
-        l_bitmap >>= e_gap;
-      }
-      l_sum += static_cast<index_t>(raft::detail::popc(l_bitmap));
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
     }
+    l_sum += static_cast<nnz_t>(raft::detail::popc(l_bitmap));
+  }
+  if (s_offset != 0) { s_bit += (BITS_PER_BITMAP - s_offset); }
 
-    l_sum = cg::reduce(tile, l_sum, cg::plus<index_t>());
+  for (size_t bit_idx = s_bit; bit_idx < e_bit; bit_idx += BITS_PER_BITMAP * blockDim.x) {
+    mutable_bitmap_t l_bitmap = 0;
+    bitmap_idx                = bit_idx / BITS_PER_BITMAP + tid;
 
-    if (lane_id == 0) { *(nnz_per_row + row) += static_cast<nnz_t>(l_sum); }
+    index_t remaining_bits = min(BITS_PER_BITMAP, index_t(e_bit - bitmap_idx * BITS_PER_BITMAP));
+
+    if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+    l_sum += static_cast<nnz_t>(raft::detail::popc(l_bitmap));
   }
+  g_sum = BlockReduce(reduce_storage).Reduce(l_sum, cub::Sum());
+  stg(g_sum, sub_col_nnz + sub_col + row * num_sub_cols, tid == 0);
 }
 
 template <typename bitmap_t, typename index_t, typename nnz_t>
@@ -103,144 +111,164 @@ void calc_nnz_by_rows(raft::resources const& handle,
                       const bitmap_t* bitmap,
                       index_t num_rows,
                       index_t num_cols,
-                      nnz_t* nnz_per_row)
+                      nnz_t* sub_col_nnz,
+                      size_t& sub_nnz_size,
+                      index_t& bits_per_sub_col)
 {
-  auto stream              = resource::get_cuda_stream(handle);
-  const index_t total      = num_rows * num_cols;
-  const index_t bitmap_num = raft::ceildiv(total, index_t(sizeof(bitmap_t) * 8));
-
-  int dev_id, sm_count, blocks_per_sm;
+  if (sub_nnz_size == 0) {
+    bits_per_sub_col = bitmap_to_csr_tpb * sizeof(index_t) * 8 * 8;
+    auto grid_dim_y  = (num_cols + bits_per_sub_col - 1) / bits_per_sub_col;
+    sub_nnz_size     = num_rows * ((num_cols + bits_per_sub_col - 1) / bits_per_sub_col);
+    return;
+  }
+  auto stream        = resource::get_cuda_stream(handle);
+  const size_t total = num_rows * num_cols;
+  const size_t bitmap_num =
+    (total + index_t(sizeof(bitmap_t) * 8) - 1) / index_t(sizeof(bitmap_t) * 8);
 
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm, calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t>, calc_nnz_by_rows_tpb, 0);
+  auto block_x = num_rows;
+  auto block_y = sub_nnz_size / num_rows;
+  dim3 grid(block_x, block_y, 1);
 
-  index_t max_active_blocks = sm_count * blocks_per_sm;
-  auto grid = std::min(max_active_blocks, raft::ceildiv(bitmap_num, index_t(calc_nnz_by_rows_tpb)));
-  auto block = calc_nnz_by_rows_tpb;
+  auto block = bitmap_to_csr_tpb;
 
-  calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t>
-    <<<grid, block, 0, stream>>>(bitmap, num_rows, num_cols, bitmap_num, nnz_per_row);
+  calc_nnz_by_rows_kernel<bitmap_t, index_t, nnz_t><<<grid, block, 0, stream>>>(
+    bitmap, num_rows, num_cols, bitmap_num, sub_col_nnz, bits_per_sub_col);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-/*
-  Execute the exclusive_scan within one warp with no inter-warp communication.
-  This function calculates the exclusive prefix sum of `value` across threads within the same warp.
-  Each thread in the warp will end up with the sum of all the values of the threads with lower IDs
-  in the same warp, with the first thread always getting a sum of 0.
-*/
-template <typename value_t>
-RAFT_DEVICE_INLINE_FUNCTION value_t warp_exclusive_scan(value_t value)
-{
-  int lane_id           = threadIdx.x & 0x1f;
-  value_t shifted_value = __shfl_up_sync(0xffffffff, value, 1, warpSize);
-  if (lane_id == 0) shifted_value = 0;
-
-  value_t sum = shifted_value;
-
-  for (int i = 1; i < warpSize; i *= 2) {
-    value_t n = __shfl_up_sync(0xffffffff, sum, i, warpSize);
-    if (lane_id >= i) { sum += n; }
-  }
-  return sum;
-}
-
-// Threads per block in fill_indices_by_rows_kernel.
-static const constexpr int fill_indices_by_rows_tpb = 32;
-
 template <typename bitmap_t, typename index_t, typename nnz_t, bool check_nnz>
-RAFT_KERNEL __launch_bounds__(fill_indices_by_rows_tpb)
+RAFT_KERNEL __launch_bounds__(bitmap_to_csr_tpb)
   fill_indices_by_rows_kernel(const bitmap_t* bitmap,
-                              const index_t* indptr,
-                              index_t num_rows,
-                              index_t num_cols,
+                              index_t* indptr,
+                              size_t num_rows,
+                              size_t num_cols,
                               nnz_t nnz,
-                              index_t bitmap_num,
-                              index_t* indices)
+                              index_t* indices,
+                              nnz_t* sub_col_nnz,
+                              index_t bits_per_sub_col)
 {
-  constexpr bitmap_t FULL_MASK      = ~bitmap_t(0u);
   constexpr bitmap_t ONE            = bitmap_t(1u);
   constexpr index_t BITS_PER_BITMAP = sizeof(bitmap_t) * 8;
 
-  int lane_id = threadIdx.x & 0x1f;
+  using mutable_bitmap_t = typename std::remove_const_t<bitmap_t>;
+  using BlockScan        = cub::BlockScan<int, bitmap_to_csr_tpb>;
+
+  __shared__ typename BlockScan::TempStorage scan_storage;
+
+  const auto tid = threadIdx.x;
+  const auto row = blockIdx.x;
+
+  const auto num_sub_cols = gridDim.y;
+  const auto sub_col      = blockIdx.y;
 
   // Ensure the HBM allocated for CSR values is sufficient to handle all non-zero bitmap bits.
   // An assert will trigger if the allocated HBM is insufficient when `NDEBUG` isn't defined.
   // Note: Assertion is active only if `NDEBUG` is undefined.
   if constexpr (check_nnz) {
-    if (lane_id == 0) { assert(nnz < indptr[num_rows]); }
+    if (tid == 0) { assert(nnz < sub_col_nnz[num_rows * num_sub_cols]); }
   }
 
+  size_t s_bit = size_t(row) * num_cols + sub_col * bits_per_sub_col;
+  size_t e_bit = min(s_bit + bits_per_sub_col, size_t(num_cols) * (row + 1));
+
+  size_t l_sum = 0;
+  __shared__ size_t g_sum;
+
+  index_t s_offset  = s_bit % BITS_PER_BITMAP;
+  size_t bitmap_idx = s_bit / BITS_PER_BITMAP;
+
+  if (tid == 0 && row == 0 && sub_col == 0) { indptr[0] = 0; }
+  if (tid == 0 && sub_col == 0) { indptr[row + 1] = sub_col_nnz[(row + 1) * num_sub_cols]; }
+
+  size_t g_nnz                   = sub_col_nnz[sub_col + row * num_sub_cols];
+  index_t* sub_cols_indices_addr = indices + g_nnz;
+
+  bool guard[BITS_PER_BITMAP];
+
+  index_t g_bits = sub_col * bits_per_sub_col + tid * BITS_PER_BITMAP;
+
+  if (tid == 0 && s_offset != 0) {
+    mutable_bitmap_t l_bitmap = bitmap[bitmap_idx];
+    l_bitmap >>= s_offset;
+
+    size_t remaining_bits = min(size_t(BITS_PER_BITMAP - s_offset), e_bit - s_bit);
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+
+#pragma unroll
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      guard[i] = l_bitmap & (ONE << i);
+    }
 #pragma unroll
-  for (index_t row = blockIdx.x; row < num_rows; row += gridDim.x) {
-    index_t g_sum      = 0;
-    index_t s_bit      = row * num_cols;
-    index_t e_bit      = s_bit + num_cols;
-    index_t indptr_row = indptr[row];
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      stg(index_t(i + g_bits), sub_cols_indices_addr + l_sum, guard[i]);
+      l_sum += guard[i];
+    }
+  }
+
+  if (tid == 0) { g_sum = l_sum; }
+  __syncthreads();
+
+  if (s_offset != 0) {
+    s_bit += (BITS_PER_BITMAP - s_offset);
+    g_bits += (BITS_PER_BITMAP - s_offset);
+  }
+
+  for (size_t bit_idx = s_bit; bit_idx < e_bit; bit_idx += BITS_PER_BITMAP * blockDim.x) {
+    mutable_bitmap_t l_bitmap = 0;
+    bitmap_idx                = bit_idx / BITS_PER_BITMAP + tid;
+
+    if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
+
+    index_t remaining_bits = min(BITS_PER_BITMAP, index_t(e_bit - bitmap_idx * BITS_PER_BITMAP));
+    if (remaining_bits < BITS_PER_BITMAP) {
+      l_bitmap &= ((mutable_bitmap_t(1) << remaining_bits) - 1);
+    }
+
+    int l_bits    = raft::detail::popc(l_bitmap);
+    int l_sum_32b = 0;
+    BlockScan(scan_storage).InclusiveSum(l_bits, l_sum_32b);
+    l_sum = l_sum_32b + g_sum - l_bits;
+    __syncthreads();
 
 #pragma unroll
-    for (index_t offset = 0; offset < num_cols; offset += BITS_PER_BITMAP * warpSize) {
-      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      std::remove_const_t<bitmap_t> l_bitmap = 0;
-      index_t l_offset = offset + lane_id * BITS_PER_BITMAP - (s_bit % BITS_PER_BITMAP);
-
-      if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
-
-      if (s_bit > bitmap_idx * BITS_PER_BITMAP) {
-        l_bitmap >>= (s_bit - bitmap_idx * BITS_PER_BITMAP);
-        l_bitmap <<= (s_bit - bitmap_idx * BITS_PER_BITMAP);
-      }
-
-      if ((bitmap_idx + 1) * BITS_PER_BITMAP > e_bit) {
-        l_bitmap <<= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit);
-        l_bitmap >>= ((bitmap_idx + 1) * BITS_PER_BITMAP - e_bit);
-      }
-
-      index_t l_sum =
-        g_sum + warp_exclusive_scan(static_cast<index_t>(raft::detail::popc(l_bitmap)));
-
-      for (int i = 0; i < BITS_PER_BITMAP; i++) {
-        if (l_bitmap & (ONE << i)) {
-          indices[indptr_row + l_sum] = l_offset + i;
-          l_sum++;
-        }
-      }
-      g_sum = __shfl_sync(0xffffffff, l_sum, warpSize - 1);
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      guard[i] = l_bitmap & (ONE << i);
     }
+#pragma unroll
+    for (int i = 0; i < BITS_PER_BITMAP; i++) {
+      stg(index_t(i + g_bits), sub_cols_indices_addr + l_sum, guard[i]);
+      l_sum += guard[i];
+    }
+
+    if (threadIdx.x == (bitmap_to_csr_tpb - 1)) { g_sum += (l_sum_32b); }
+    g_bits += BITS_PER_BITMAP * blockDim.x;
   }
 }
 
 template <typename bitmap_t, typename index_t, typename nnz_t, bool check_nnz = false>
 void fill_indices_by_rows(raft::resources const& handle,
                           const bitmap_t* bitmap,
-                          const index_t* indptr,
+                          index_t* indptr,
                           index_t num_rows,
                           index_t num_cols,
                           nnz_t nnz,
-                          index_t* indices)
+                          index_t* indices,
+                          nnz_t* sub_col_nnz,
+                          index_t bits_per_sub_col,
+                          size_t sub_nnz_size)
 {
-  auto stream              = resource::get_cuda_stream(handle);
-  const index_t total      = num_rows * num_cols;
-  const index_t bitmap_num = raft::ceildiv(total, index_t(sizeof(bitmap_t) * 8));
-
-  int dev_id, sm_count, blocks_per_sm;
-
-  cudaGetDevice(&dev_id);
-  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &blocks_per_sm,
-    fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz>,
-    fill_indices_by_rows_tpb,
-    0);
-
-  index_t max_active_blocks = sm_count * blocks_per_sm;
-  auto grid                 = std::min(max_active_blocks, num_rows);
-  auto block                = fill_indices_by_rows_tpb;
-
-  fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz>
-    <<<grid, block, 0, stream>>>(bitmap, indptr, num_rows, num_cols, nnz, bitmap_num, indices);
+  auto stream  = resource::get_cuda_stream(handle);
+  auto block_x = num_rows;
+  auto block_y = sub_nnz_size / num_rows;
+  dim3 grid(block_x, block_y, 1);
+
+  auto block = bitmap_to_csr_tpb;
+
+  fill_indices_by_rows_kernel<bitmap_t, index_t, nnz_t, check_nnz><<<grid, block, 0, stream>>>(
+    bitmap, indptr, num_rows, num_cols, nnz, indices, sub_col_nnz, bits_per_sub_col);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -252,12 +280,9 @@ void bitmap_to_csr(raft::resources const& handle,
                    raft::core::bitmap_view<bitmap_t, index_t> bitmap,
                    csr_matrix_t& csr)
 {
+  using nnz_t   = typename csr_matrix_t::nnz_type;
   auto csr_view = csr.structure_view();
 
-  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0 || csr_view.get_nnz() == 0) {
-    return;
-  }
-
   RAFT_EXPECTS(bitmap.get_n_rows() == csr_view.get_n_rows(),
                "Number of rows in bitmap must be equal to "
                "number of rows in csr");
@@ -266,6 +291,8 @@ void bitmap_to_csr(raft::resources const& handle,
                "Number of columns in bitmap must be equal to "
                "number of columns in csr");
 
+  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0) { return; }
+
   auto thrust_policy = resource::get_thrust_policy(handle);
   auto stream        = resource::get_cuda_stream(handle);
 
@@ -274,25 +301,52 @@ void bitmap_to_csr(raft::resources const& handle,
 
   RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
 
-  calc_nnz_by_rows(handle, bitmap.data(), csr_view.get_n_rows(), csr_view.get_n_cols(), indptr);
-  thrust::exclusive_scan(thrust_policy, indptr, indptr + csr_view.get_n_rows() + 1, indptr);
+  size_t sub_nnz_size      = 0;
+  index_t bits_per_sub_col = 0;
+
+  // Get the buffer size and the number of bits per sub-column
+  calc_nnz_by_rows(handle,
+                   bitmap.data(),
+                   csr_view.get_n_rows(),
+                   csr_view.get_n_cols(),
+                   static_cast<nnz_t*>(nullptr),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
+  rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
+
+  calc_nnz_by_rows(handle,
+                   bitmap.data(),
+                   csr_view.get_n_rows(),
+                   csr_view.get_n_cols(),
+                   sub_nnz.data(),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+
+  thrust::exclusive_scan(
+    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
 
   if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
-    index_t nnz = 0;
+    nnz_t nnz = 0;
     RAFT_CUDA_TRY(cudaMemcpyAsync(
-      &nnz, indptr + csr_view.get_n_rows(), sizeof(index_t), cudaMemcpyDeviceToHost, stream));
+      &nnz, sub_nnz.data() + sub_nnz_size, sizeof(nnz_t), cudaMemcpyDeviceToHost, stream));
     resource::sync_stream(handle);
     csr.initialize_sparsity(nnz);
+    if (nnz == 0) return;
   }
+
   constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
-  fill_indices_by_rows<bitmap_t, index_t, typename csr_matrix_t::nnz_type, check_nnz>(
-    handle,
-    bitmap.data(),
-    indptr,
-    csr_view.get_n_rows(),
-    csr_view.get_n_cols(),
-    csr_view.get_nnz(),
-    indices);
+  fill_indices_by_rows<bitmap_t, index_t, nnz_t, check_nnz>(handle,
+                                                            bitmap.data(),
+                                                            indptr,
+                                                            csr_view.get_n_rows(),
+                                                            csr_view.get_n_cols(),
+                                                            csr_view.get_nnz(),
+                                                            indices,
+                                                            sub_nnz.data(),
+                                                            bits_per_sub_col,
+                                                            sub_nnz_size);
 
   thrust::fill_n(thrust_policy,
                  csr.get_elements().data(),
diff --git a/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
new file mode 100644
index 0000000000..b3b341d793
--- /dev/null
+++ b/cpp/include/raft/sparse/convert/detail/bitset_to_csr.cuh
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/detail/mdspan_util.cuh>  // detail::popc
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/convert/detail/adj_to_csr.cuh>
+#include <raft/sparse/convert/detail/bitmap_to_csr.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+#include <thrust/copy.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <assert.h>
+
+namespace raft {
+namespace sparse {
+namespace convert {
+namespace detail {
+
+// Replicates one CSR row: every repeated row receives a copy of `indices`, and entry i of
+// `repeated_indptr` is set to (i + 2) * nnz. `indptr` is not read by the kernel.
+template <typename index_t, typename nnz_t>
+RAFT_KERNEL repeat_csr_kernel(const index_t* indptr,
+                              const index_t* indices,
+                              index_t* repeated_indptr,
+                              index_t* repeated_indices,
+                              nnz_t nnz,
+                              index_t repeat_count)
+{
+  // Form the thread id and stride in nnz_t: with a 64-bit nnz_t neither the id nor the
+  // row * nnz offsets below are squeezed through a 32-bit int / index_t.
+  const nnz_t global_id = static_cast<nnz_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const nnz_t stride    = static_cast<nnz_t>(gridDim.x) * blockDim.x;
+
+  // Grid-stride loop: the grid is sized from nnz, which may be smaller than repeat_count.
+  for (nnz_t i = global_id; i < repeat_count; i += stride) {
+    repeated_indptr[i] = (i + 2) * nnz;
+  }
+
+  // Threads past the last non-zero only help with the offsets above.
+  if (global_id >= nnz) { return; }
+
+  const index_t item = indices[global_id];
+  for (index_t row = 0; row < repeat_count; ++row) {
+    repeated_indices[static_cast<nnz_t>(row) * nnz + global_id] = item;
+  }
+}
+
+// Enqueues repeat_csr_kernel on the handle's CUDA stream, one thread per entry of d_indices.
+template <typename index_t, typename nnz_t>
+void gpu_repeat_csr(raft::resources const& handle,
+                    const index_t* d_indptr,
+                    const index_t* d_indices,
+                    nnz_t nnz,
+                    index_t repeat_count,
+                    index_t* d_repeated_indptr,
+                    index_t* d_repeated_indices)
+{
+  if (nnz == 0) return;  // nothing to replicate, and a zero-block launch would be invalid
+  auto stream            = resource::get_cuda_stream(handle);
+  index_t repeat_csr_tpb = 256;
+  index_t grid           = (nnz + repeat_csr_tpb - 1) / (repeat_csr_tpb);  // ceil(nnz / tpb)
+  repeat_csr_kernel<<<grid, repeat_csr_tpb, 0, stream>>>(
+    d_indptr, d_indices, d_repeated_indptr, d_repeated_indices, nnz, repeat_count);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // report a failed launch here, not at a later sync
+}
+
+// Expands a single-row bitset into a CSR matrix: fill_indices_by_rows is run for one row, and
+// gpu_repeat_csr then replicates that row for the remaining rows. All elements are set to 1.
+template <typename bitset_t,
+          typename index_t,
+          typename csr_matrix_t,
+          typename = std::enable_if_t<raft::is_device_csr_matrix_v<csr_matrix_t>>>
+void bitset_to_csr(raft::resources const& handle,
+                   raft::core::bitset_view<bitset_t, index_t> bitset,
+                   csr_matrix_t& csr)
+{
+  using row_t = typename csr_matrix_t::row_type;
+  using nnz_t = typename csr_matrix_t::nnz_type;
+
+  auto csr_view = csr.structure_view();
+  RAFT_EXPECTS(bitset.size() == csr_view.get_n_cols(),
+               "Number of size in bitset must be equal to "
+               "number of columns in csr");
+  if (csr_view.get_n_rows() == 0 || csr_view.get_n_cols() == 0) { return; }
+
+  auto thrust_policy = resource::get_thrust_policy(handle);
+  auto stream        = resource::get_cuda_stream(handle);
+
+  index_t* indptr  = csr_view.get_indptr().data();
+  index_t* indices = csr_view.get_indices().data();
+  // Start from all-zero row offsets (n_rows + 1 entries).
+  RAFT_CUDA_TRY(cudaMemsetAsync(indptr, 0, (csr_view.get_n_rows() + 1) * sizeof(index_t), stream));
+
+  size_t sub_nnz_size      = 0;
+  index_t bits_per_sub_col = 0;
+
+  // Dry run with a null buffer: get the buffer size and the number of bits per sub-column
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   static_cast<nnz_t*>(nullptr),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+  // One extra slot so that the exclusive scan below leaves the total in the last element.
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
+  rmm::device_uvector<nnz_t> sub_nnz(sub_nnz_size + 1, stream, device_memory);
+  // Same call with a real buffer (presumably stores the set-bit count of each sub-column).
+  calc_nnz_by_rows(handle,
+                   bitset.data(),
+                   row_t(1),
+                   csr_view.get_n_cols(),
+                   sub_nnz.data(),
+                   sub_nnz_size,
+                   bits_per_sub_col);
+  // In-place exclusive scan; the last slot ends up holding the sum of all preceding entries.
+  thrust::exclusive_scan(
+    thrust_policy, sub_nnz.data(), sub_nnz.data() + sub_nnz_size + 1, sub_nnz.data());
+  nnz_t bitset_nnz = 0;
+  if constexpr (is_device_csr_sparsity_owning_v<csr_matrix_t>) {
+    RAFT_CUDA_TRY(cudaMemcpyAsync(
+      &bitset_nnz, sub_nnz.data() + sub_nnz_size, sizeof(nnz_t), cudaMemcpyDeviceToHost, stream));
+    resource::sync_stream(handle);  // the async copy must finish before bitset_nnz is read
+    csr.initialize_sparsity(bitset_nnz * csr_view.get_n_rows());
+    if (bitset_nnz == 0) return;
+  } else {
+    bitset_nnz = csr_view.get_nnz() / csr_view.get_n_rows();  // per-row share of the preset nnz
+  }
+  // NOTE(review): indptr/indices/csr_view predate initialize_sparsity() -- confirm still valid.
+  constexpr bool check_nnz = is_device_csr_sparsity_preserving_v<csr_matrix_t>;
+  fill_indices_by_rows<bitset_t, index_t, nnz_t, check_nnz>(handle,
+                                                            bitset.data(),
+                                                            indptr,
+                                                            1,
+                                                            csr_view.get_n_cols(),
+                                                            csr_view.get_nnz(),
+                                                            indices,
+                                                            sub_nnz.data(),
+                                                            bits_per_sub_col,
+                                                            sub_nnz_size);
+  if (csr_view.get_n_rows() > 1) {
+    gpu_repeat_csr<index_t, nnz_t>(handle,
+                                   indptr,
+                                   indices,
+                                   bitset_nnz,
+                                   csr_view.get_n_rows() - 1,
+                                   indptr + 2,
+                                   indices + bitset_nnz);
+  }
+  // The mask carries no weights: every stored element is set to 1.
+  thrust::fill_n(thrust_policy,
+                 csr.get_elements().data(),
+                 csr_view.get_nnz(),
+                 typename csr_matrix_t::element_type(1));
+}
+
+};  // end NAMESPACE detail
+};  // end NAMESPACE convert
+};  // end NAMESPACE sparse
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh
index 0a498bb1ca..469dac3c86 100644
--- a/cpp/include/raft/sparse/convert/detail/coo.cuh
+++ b/cpp/include/raft/sparse/convert/detail/coo.cuh
@@ -76,4 +76,4 @@ void csr_to_coo(
 };  // end NAMESPACE detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/convert/detail/dense.cuh b/cpp/include/raft/sparse/convert/detail/dense.cuh
index e60e494d34..ec3d0ec1c3 100644
--- a/cpp/include/raft/sparse/convert/detail/dense.cuh
+++ b/cpp/include/raft/sparse/convert/detail/dense.cuh
@@ -141,4 +141,4 @@ void csr_to_dense(cusparseHandle_t handle,
 };  // namespace detail
 };  // end NAMESPACE convert
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 91ba363168..9a38c11a07 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -182,7 +182,7 @@ class COO {
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of columns in the dense matrix
    */
-  void setSize(int n_rows, int n_cols)
+  void setSize(Index_Type n_rows, Index_Type n_cols)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
@@ -192,7 +192,7 @@ class COO {
    * @brief Set the number of rows and cols for a square dense matrix
    * @param n: number of rows and cols
    */
-  void setSize(int n)
+  void setSize(Index_Type n)
   {
     this->n_rows = n;
     this->n_cols = n;
@@ -204,7 +204,10 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
+  void allocate(Index_Type nnz, bool init, cudaStream_t stream)
+  {
+    this->allocate(nnz, 0, init, stream);
+  }
 
   /**
    * @brief Allocate the underlying arrays
@@ -213,7 +216,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream)
+  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -226,7 +229,8 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
+  void allocate(
+    Index_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
diff --git a/cpp/include/raft/sparse/detail/cusparse_macros.h b/cpp/include/raft/sparse/detail/cusparse_macros.h
index e7d81f51aa..d5262581a3 100644
--- a/cpp/include/raft/sparse/detail/cusparse_macros.h
+++ b/cpp/include/raft/sparse/detail/cusparse_macros.h
@@ -20,4 +20,4 @@
 
 #pragma once
 
-#include <raft/core/cusparse_macros.hpp>
\ No newline at end of file
+#include <raft/core/cusparse_macros.hpp>
diff --git a/cpp/include/raft/sparse/distance/detail/common.hpp b/cpp/include/raft/sparse/distance/detail/common.hpp
index 0f463dac80..19fe9c1786 100644
--- a/cpp/include/raft/sparse/distance/detail/common.hpp
+++ b/cpp/include/raft/sparse/distance/detail/common.hpp
@@ -56,4 +56,4 @@ class distances_t {
 };  // namespace detail
 };  // namespace distance
 };  // namespace sparse
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
index 38aa106d78..59cfcfa186 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -229,4 +229,4 @@ class chunked_mask_row_it : public mask_row_it<value_idx> {
 }  // namespace detail
 }  // namespace distance
 }  // namespace sparse
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
index 5a1c152bd0..4a075cf530 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -116,4 +116,4 @@ class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
 }  // namespace detail
 }  // namespace distance
 }  // namespace sparse
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
index ead44f0c51..5bcd1ff005 100644
--- a/cpp/include/raft/sparse/distance/distance.cuh
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -221,4 +221,4 @@ void pairwise_distance(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index def305afb2..a97b935f58 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -96,4 +96,4 @@ void csr_add_finalize(const int* a_ind,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 57c9b986b4..8ac97259da 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -120,4 +120,4 @@ void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
index 276960628d..bfffa413b2 100644
--- a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/bitset.cuh>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -41,7 +42,7 @@ template <typename value_t, typename output_t, typename index_t, typename nnz_t,
 void masked_matmul(raft::resources const& handle,
                    raft::device_matrix_view<const value_t, index_t, raft::row_major>& A,
                    raft::device_matrix_view<const value_t, index_t, raft::row_major>& B,
-                   raft::core::bitmap_view<const bitmap_t, index_t>& mask,
+                   raft::core::bitmap_view<bitmap_t, index_t>& mask,
                    raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t>& C,
                    std::optional<raft::host_scalar_view<output_t>> alpha,
                    std::optional<raft::host_scalar_view<output_t>> beta)
@@ -100,6 +101,69 @@ void masked_matmul(raft::resources const& handle,
   }
 }
 
+// Bitset overload: `mask` describes one row of C; bitset_to_csr expands it to all rows of C,
+// then C is computed by sddmm, or by faster_dot_on_csr when m <= 10 and alpha/beta are unset.
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major>& B,
+                   raft::core::bitset_view<bitset_t, index_t>& mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t>& C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha,
+                   std::optional<raft::host_scalar_view<output_t>> beta)
+{
+  index_t m   = A.extent(0);
+  index_t n   = B.extent(0);
+  index_t dim = A.extent(1);
+
+  auto compressed_C_view = C.structure_view();
+
+  RAFT_EXPECTS(A.extent(1) == B.extent(1), "The dim of A must be equal to the dim of B.");
+  RAFT_EXPECTS(A.extent(0) == compressed_C_view.get_n_rows(),
+               "Number of rows in C must match the number of rows in A.");
+  RAFT_EXPECTS(B.extent(0) == compressed_C_view.get_n_cols(),
+               "Number of columns in C must match the number of columns in B.");
+
+  auto C_matrix = raft::make_device_csr_matrix<output_t, index_t>(handle, compressed_C_view);
+
+  // Fill the sparsity structure from the mask (C_matrix is built on C's structure view).
+  raft::sparse::convert::bitset_to_csr(handle, mask, C_matrix);
+
+  if (m > 10 || alpha.has_value() || beta.has_value()) {
+    // Use C's own nnz_t so the view type matches the structure view when nnz_t != index_t.
+    auto C_view = raft::make_device_csr_matrix_view<output_t, index_t, index_t, nnz_t>(
+      C.get_elements().data(), compressed_C_view);
+
+    // View row-major B [n, dim] as col-major [dim, n], i.e. B^T, without copying.
+    auto B_col_major = raft::make_device_matrix_view<const value_t, index_t, raft::col_major>(
+      B.data_handle(), dim, n);
+
+    output_t default_alpha = static_cast<output_t>(1.0f);
+    output_t default_beta  = static_cast<output_t>(0.0f);
+    if (!alpha.has_value()) { alpha = raft::make_host_scalar_view<output_t>(&default_alpha); }
+    if (!beta.has_value()) { beta = raft::make_host_scalar_view<output_t>(&default_beta); }
+
+    raft::sparse::linalg::sddmm(handle,
+                                A,
+                                B_col_major,
+                                C_view,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                raft::linalg::Operation::NON_TRANSPOSE,
+                                *alpha,
+                                *beta);
+  } else {
+    raft::sparse::distance::detail::faster_dot_on_csr(handle,
+                                                      C.get_elements().data(),
+                                                      compressed_C_view.get_nnz(),
+                                                      compressed_C_view.get_indptr().data(),
+                                                      compressed_C_view.get_indices().data(),
+                                                      A.data_handle(),
+                                                      B.data_handle(),
+                                                      compressed_C_view.get_n_rows(),
+                                                      dim);
+  }
+}
+
 }  // namespace detail
 }  // namespace linalg
 }  // namespace sparse
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh
index 3702111f83..2619048388 100644
--- a/cpp/include/raft/sparse/linalg/detail/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -232,4 +232,4 @@ void rowNormCsrCaller(const IdxType* ia,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/detail/transpose.h b/cpp/include/raft/sparse/linalg/detail/transpose.h
index 3a646b9a6e..579ee88d38 100644
--- a/cpp/include/raft/sparse/linalg/detail/transpose.h
+++ b/cpp/include/raft/sparse/linalg/detail/transpose.h
@@ -107,4 +107,4 @@ void csr_transpose(cusparseHandle_t handle,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
new file mode 100644
index 0000000000..c33a1afd43
--- /dev/null
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.cuh
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/sparse/linalg/detail/masked_matmul.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/**
+ * @defgroup masked_matmul Masked Matrix Multiplication
+ * @{
+ */
+
+/**
+ * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
+ * multiplication with the sparsity pattern defined by the mask, resulting in the computation
+ * C = alpha * ((A * B^T) ∘ spy(mask)) + beta * C.
+ *
+ * Both A [m, k] and B [n, k] are row-major, so the dense product A * B^T has shape [m, n]. Only
+ * the entries selected by the mask are kept; the result is scaled by alpha and added to beta
+ * times the original matrix C.
+ *
+ * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double)
+ * @tparam output_t Data type of elements in the output matrices (e.g., float, double)
+ * @tparam index_t Type used for matrix indices
+ * @tparam nnz_t Type used for the number of non-zero entries in CSR format
+ * @tparam bitmap_t Type of the bitmap used for the mask
+ *
+ * @param[in] handle RAFT handle for resource management
+ * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k]
+ * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k]
+ * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape
+ * [m, n]. Each bit in the mask indicates whether the corresponding element of A * B^T is
+ * included (1) or masked out (0).
+ * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m,
+ * n]
+ * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if
+ * std::nullopt)
+ * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt)
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitmap_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitmap_view<bitmap_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/**
+ * @brief Computes a sparse matrix product with a masked sparsity pattern and scaling.
+ *
+ * This function computes the result of:
+ * C = alpha * ((A * B^T) ∘ spy(mask)) + beta * C
+ * where:
+ * - A [m, k] and B [n, k] are dense row-major input matrices, so A * B^T has shape [m, n].
+ * - "mask" defines the sparsity pattern for element-wise multiplication.
+ * - The result is scaled by alpha and added to beta times the original C.
+ *
+ * **Special behavior of the mask**:
+ * - The `bitset` mask represents a single row of data, with its bits indicating whether
+ *   each corresponding element in a row of (A * B^T) is included (1) or masked out (0).
+ * - If the output CSR matrix `C` has multiple rows, the `bitset` is logically repeated
+ *   across all rows of `C`. For example, if `C` has `n_rows` rows, the same `bitset`
+ *   pattern is applied to all rows.
+ *
+ * @tparam value_t    Data type of input matrix elements (e.g., half, float, double).
+ * @tparam output_t   Data type of output matrix elements (e.g., float, double).
+ * @tparam index_t    Type for matrix indices.
+ * @tparam nnz_t      Type for the number of non-zero entries in CSR format.
+ * @tparam bitset_t   Type for the bitset mask.
+ *
+ * @param[in] handle  RAFT handle for managing resources.
+ * @param[in] A       Dense input matrix [m, k] (row-major).
+ * @param[in] B       Dense input matrix [n, k] (row-major).
+ * @param[in] mask    Bitset view representing a single row [1, n], where each bit
+ *                    indicates if the corresponding element in (A * B^T) is included (1)
+ *                    or masked out (0). The pattern is repeated for all rows of `C`.
+ * @param[inout] C    Output sparse matrix in CSR format [m, n].
+ * @param[in] alpha   Scalar multiplier for (A * B^T) (default: 1.0 if std::nullopt).
+ * @param[in] beta    Scalar multiplier for the initial C (default: 0 if std::nullopt).
+ */
+template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitset_t>
+void masked_matmul(raft::resources const& handle,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
+                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
+                   raft::core::bitset_view<bitset_t, index_t> mask,
+                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
+                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
+                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
+{
+  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
+}
+
+/** @} */  // end of masked_matmul
+
+}  // end namespace linalg
+}  // end namespace sparse
+}  // end namespace raft
diff --git a/cpp/include/raft/sparse/linalg/masked_matmul.hpp b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
index 6cf6e834b9..32322b90f6 100644
--- a/cpp/include/raft/sparse/linalg/masked_matmul.hpp
+++ b/cpp/include/raft/sparse/linalg/masked_matmul.hpp
@@ -13,60 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
-
-#include <raft/sparse/linalg/detail/masked_matmul.cuh>
-
-namespace raft {
-namespace sparse {
-namespace linalg {
-
 /**
- * @defgroup masked_matmul Masked Matrix Multiplication
- * @{
+ * This file is deprecated and will be removed in a future release.
+ * Please use the cuh version instead.
  */
 
 /**
- * @brief Performs a masked multiplication of dense matrices A and B, followed by an element-wise
- * multiplication with the sparsity pattern defined by the mask, resulting in the computation
- * C = alpha * ((A * B) ∘ spy(mask)) + beta * C.
- *
- * This function multiplies two dense matrices A and B, and then applies an element-wise
- * multiplication using the sparsity pattern provided by the mask. The result is scaled by alpha
- * and added to beta times the original matrix C.
- *
- * @tparam value_t Data type of elements in the input matrices (e.g., half, float, double)
- * @tparam output_t Data type of elements in the output matrices (e.g., float, double)
- * @tparam index_t Type used for matrix indices
- * @tparam nnz_t Type used for the number of non-zero entries in CSR format
- * @tparam bitmap_t Type of the bitmap used for the mask
- *
- * @param[in] handle RAFT handle for resource management
- * @param[in] A Input dense matrix (device_matrix_view) with shape [m, k]
- * @param[in] B Input dense matrix (device_matrix_view) with shape [n, k]
- * @param[in] mask Bitmap view representing the sparsity pattern (bitmap_view) with logical shape
- * [m, n]. Each bit in the mask indicates whether the corresponding element pair in A and B is
- * included (1) or masked out (0).
- * @param[inout] C Output sparse matrix in CSR format (device_csr_matrix_view) with dense shape [m,
- * n]
- * @param[in] alpha Optional scalar multiplier for the product of A and B (default: 1.0 if
- * std::nullopt)
- * @param[in] beta Optional scalar multiplier for the original matrix C (default: 0 if std::nullopt)
+ * DISCLAIMER: this file is deprecated: use masked_matmul.cuh instead
  */
-template <typename value_t, typename output_t, typename index_t, typename nnz_t, typename bitmap_t>
-void masked_matmul(raft::resources const& handle,
-                   raft::device_matrix_view<const value_t, index_t, raft::row_major> A,
-                   raft::device_matrix_view<const value_t, index_t, raft::row_major> B,
-                   raft::core::bitmap_view<const bitmap_t, index_t> mask,
-                   raft::device_csr_matrix_view<output_t, index_t, index_t, nnz_t> C,
-                   std::optional<raft::host_scalar_view<output_t>> alpha = std::nullopt,
-                   std::optional<raft::host_scalar_view<output_t>> beta  = std::nullopt)
-{
-  detail::masked_matmul(handle, A, B, mask, C, alpha, beta);
-}
 
-/** @} */  // end of masked_matmul
+#pragma once
+
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
+#pragma message(__FILE__                                                    \
+                  " is deprecated and will be removed in a future release." \
+                  " Please use the cuh version instead.")
+#endif
 
-}  // end namespace linalg
-}  // end namespace sparse
-}  // end namespace raft
+#include <raft/sparse/linalg/masked_matmul.cuh>
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index 43dd182fe5..7adf245abc 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -104,4 +104,4 @@ void rowNormCsr(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 4c0595bf91..276a64c125 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -40,4 +40,4 @@ void fit_embedding(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 1de8d5b426..8ee53cd3ae 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -165,4 +165,4 @@ void symmetrize(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh
index 4333060ad9..304cbf4936 100644
--- a/cpp/include/raft/sparse/linalg/transpose.cuh
+++ b/cpp/include/raft/sparse/linalg/transpose.cuh
@@ -68,4 +68,4 @@ void csr_transpose(raft::resources const& handle,
 
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
index c94c6254c3..ed4aa4c98f 100644
--- a/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/neighbors/cross_component_nn.cuh
@@ -96,4 +96,4 @@ void cross_component_nn(
                              metric);
 }
 
-};  // end namespace raft::sparse::neighbors
\ No newline at end of file
+};  // end namespace raft::sparse::neighbors
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index c64c05ae4e..4b329325ca 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -91,4 +91,4 @@ void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index 52f1d3b239..b03192f111 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -84,4 +84,4 @@ void max_duplicates(raft::resources const& handle,
 };  // END namespace sparse
 };  // END namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index a799093226..b8d5a49d9f 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -45,4 +45,4 @@ void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cud
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/op/slice.cuh b/cpp/include/raft/sparse/op/slice.cuh
index 2da6dad4fc..e8a456d23e 100644
--- a/cpp/include/raft/sparse/op/slice.cuh
+++ b/cpp/include/raft/sparse/op/slice.cuh
@@ -78,4 +78,4 @@ void csr_row_slice_populate(value_idx start_offset,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 02a77a0d99..ddfa01731a 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -24,7 +24,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -624,7 +624,7 @@ static int lanczosRestart(raft::resources const& handle,
   value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + (size_t)n * (size_t)iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -641,7 +641,7 @@ static int lanczosRestart(raft::resources const& handle,
   // std::cout <<std::endl;
 
   // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(value_type_t));
+  memset(V_host, 0, (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t));
   for (i = 0; i < iter; ++i)
     V_host[IDX(i, i, iter)] = 1;
 
@@ -679,8 +679,11 @@ static int lanczosRestart(raft::resources const& handle,
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  RAFT_CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(V_dev,
+                                V_host,
+                                (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
 
   beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
   RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h,
@@ -716,7 +719,7 @@ static int lanczosRestart(raft::resources const& handle,
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
                                 work_dev,
-                                n * iter_new * sizeof(value_type_t),
+                                (size_t)n * (size_t)iter_new * (size_t)sizeof(value_type_t),
                                 cudaMemcpyDeviceToDevice,
                                 stream));
 
@@ -1045,10 +1048,10 @@ int computeSmallestEigenvectors(
   unsigned long long seed = 1234567)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1395,10 +1398,10 @@ int computeLargestEigenvectors(
   unsigned long long seed = 123456)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
diff --git a/cpp/include/raft/sparse/solver/lanczos.cuh b/cpp/include/raft/sparse/solver/lanczos.cuh
index fed31e6a9c..4c45a28cc6 100644
--- a/cpp/include/raft/sparse/solver/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/lanczos.cuh
@@ -230,4 +230,4 @@ int computeLargestEigenvectors(
 
 }  // namespace raft::sparse::solver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/cluster_solvers.cuh b/cpp/include/raft/spectral/cluster_solvers.cuh
index b693ac4af3..c273808cf8 100644
--- a/cpp/include/raft/spectral/cluster_solvers.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers.cuh
@@ -97,4 +97,4 @@ struct kmeans_solver_t {
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
index 40b0324548..139df1d27f 100644
--- a/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
+++ b/cpp/include/raft/spectral/cluster_solvers_deprecated.cuh
@@ -87,4 +87,4 @@ struct kmeans_solver_deprecated_t {
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 1fe078bd32..db8a5dc9ef 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -39,14 +39,14 @@
 // =========================================================
 
 // Get index of matrix entry
-#define IDX(i, j, lda) ((i) + (j) * (lda))
+#define IDX(i, j, lda) ((size_t)(i) + (size_t)(j) * (size_t)(lda))  // widen before multiply
 
 namespace raft {
 namespace spectral {
 namespace matrix {
 namespace detail {
 
-using size_type = int;  // for now; TODO: move it in appropriate header
+using size_type = size_t;  // for now; TODO: move it in appropriate header
 
 // Apply diagonal matrix to vector:
 //
@@ -326,7 +326,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
         raft_handle, row_offsets, col_indices, values, nrows, nnz),
       diagonal_(raft_handle, nrows)
   {
-    vector_t<value_type> ones{raft_handle, nrows};
+    vector_t<value_type> ones{raft_handle, (size_t)nrows};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
@@ -341,7 +341,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
                                               csr_m.nnz_),
       diagonal_(raft_handle, csr_m.nrows_)
   {
-    vector_t<value_type> ones{raft_handle, csr_m.nrows_};
+    vector_t<value_type> ones{raft_handle, (size_t)csr_m.nrows_};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
diff --git a/cpp/include/raft/spectral/modularity_maximization.cuh b/cpp/include/raft/spectral/modularity_maximization.cuh
index ab1398a2a1..6514f7ef21 100644
--- a/cpp/include/raft/spectral/modularity_maximization.cuh
+++ b/cpp/include/raft/spectral/modularity_maximization.cuh
@@ -83,4 +83,4 @@ void analyzeModularity(raft::resources const& handle,
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/spectral/partition.cuh b/cpp/include/raft/spectral/partition.cuh
index f7ea456ac5..a2ac328aa1 100644
--- a/cpp/include/raft/spectral/partition.cuh
+++ b/cpp/include/raft/spectral/partition.cuh
@@ -92,4 +92,4 @@ void analyzePartition(raft::resources const& handle,
 }  // namespace spectral
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/accuracy.cuh b/cpp/include/raft/stats/accuracy.cuh
index 6625d38a7a..0b352e185b 100644
--- a/cpp/include/raft/stats/accuracy.cuh
+++ b/cpp/include/raft/stats/accuracy.cuh
@@ -75,4 +75,4 @@ float accuracy(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh
index 1f97cd5f76..6822e069a2 100644
--- a/cpp/include/raft/stats/adjusted_rand_index.cuh
+++ b/cpp/include/raft/stats/adjusted_rand_index.cuh
@@ -86,4 +86,4 @@ double adjusted_rand_index(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index b669e0de32..f4667b37dc 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -88,4 +88,4 @@ double completeness_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/contingency_matrix.cuh b/cpp/include/raft/stats/contingency_matrix.cuh
index 16f0998435..03fa0d4924 100644
--- a/cpp/include/raft/stats/contingency_matrix.cuh
+++ b/cpp/include/raft/stats/contingency_matrix.cuh
@@ -214,4 +214,4 @@ void contingency_matrix(Args... args)
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/cov.cuh b/cpp/include/raft/stats/cov.cuh
index ad5d233c0e..096ec4bc1c 100644
--- a/cpp/include/raft/stats/cov.cuh
+++ b/cpp/include/raft/stats/cov.cuh
@@ -119,4 +119,4 @@ void cov(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index ee39c87a68..1262d538c8 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -27,7 +27,25 @@ namespace stats {
 namespace detail {
 
 template <typename Type, typename IdxType = int>
-void mean(
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
+  Type ratio = Type(1) / Type(N);
+  raft::linalg::reduce(mu,
+                       data,
+                       D,
+                       N,
+                       Type(0),
+                       rowMajor,
+                       false,
+                       stream,
+                       false,
+                       raft::identity_op(),
+                       raft::add_op(),
+                       raft::mul_const_op<Type>(ratio));
+}
+
+template <typename Type, typename IdxType = int>
+[[deprecated]] void mean(
   Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
 {
   Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N));
@@ -47,4 +65,4 @@ void mean(
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh
index 947df6848a..66951f52ab 100644
--- a/cpp/include/raft/stats/detail/scores.cuh
+++ b/cpp/include/raft/stats/detail/scores.cuh
@@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
   rmm::device_scalar<math_t> y_bar(stream);
 
-  raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream);
+  raft::stats::mean(y_bar.data(), y, 1, n, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   rmm::device_uvector<math_t> sse_arr(n, stream);
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index 4c861b49fb..c758584ec9 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -120,4 +120,4 @@ void vars(Type* var,
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh
index 39bd2c3b6c..4f5438b133 100644
--- a/cpp/include/raft/stats/detail/sum.cuh
+++ b/cpp/include/raft/stats/detail/sum.cuh
@@ -34,4 +34,4 @@ void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, c
 
 }  // namespace detail
 }  // namespace stats
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/stats/detail/weighted_mean.cuh b/cpp/include/raft/stats/detail/weighted_mean.cuh
index ada0995f7d..9b96ed5949 100644
--- a/cpp/include/raft/stats/detail/weighted_mean.cuh
+++ b/cpp/include/raft/stats/detail/weighted_mean.cuh
@@ -72,4 +72,4 @@ void weightedMean(Type* mu,
 }
 };  // end namespace detail
 };  // end namespace stats
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/include/raft/stats/dispersion.cuh b/cpp/include/raft/stats/dispersion.cuh
index ded7c8178b..444cc04bca 100644
--- a/cpp/include/raft/stats/dispersion.cuh
+++ b/cpp/include/raft/stats/dispersion.cuh
@@ -131,4 +131,4 @@ value_t cluster_dispersion(
 }  // end namespace stats
 }  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh
index fe432569ee..a0c6ae5bdb 100644
--- a/cpp/include/raft/stats/entropy.cuh
+++ b/cpp/include/raft/stats/entropy.cuh
@@ -83,4 +83,4 @@ double entropy(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/homogeneity_score.cuh b/cpp/include/raft/stats/homogeneity_score.cuh
index 311cd599f8..3095d2c724 100644
--- a/cpp/include/raft/stats/homogeneity_score.cuh
+++ b/cpp/include/raft/stats/homogeneity_score.cuh
@@ -91,4 +91,4 @@ double homogeneity_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 43d39cfd6c..b76b945400 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,27 @@ namespace stats {
  * @param data: the input matrix
  * @param D: number of columns of data
  * @param N: number of rows of data
+ * @param rowMajor: whether the input data is row or col major
+ * @param stream: cuda stream
+ */
+template <typename Type, typename IdxType = int>
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
+  detail::mean(mu, data, D, N, rowMajor, stream);
+}
+
+/**
+ * @brief Compute mean of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ * Note: This call is deprecated, please use `mean` call without `sample` parameter.
+ *
+ * @tparam Type: the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param mu: the output mean vector
+ * @param data: the input matrix
+ * @param D: number of columns of data
+ * @param N: number of rows of data
  * @param sample: whether to evaluate sample mean or not. In other words,
  * whether
  *  to normalize the output using N-1 or N, for true or false, respectively
@@ -45,7 +66,7 @@ namespace stats {
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(
+[[deprecated("'sample' parameter deprecated")]] void mean(
   Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
 {
   detail::mean(mu, data, D, N, sample, rowMajor, stream);
@@ -67,14 +88,47 @@ void mean(
  * @param[in]  handle the raft handle
  * @param[in]  data: the input matrix
  * @param[out] mu: the output mean vector
- * @param[in]  sample: whether to evaluate sample mean or not. In other words, whether
- *   to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
 void mean(raft::resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
-          raft::device_vector_view<value_t, idx_t> mu,
-          bool sample)
+          raft::device_vector_view<value_t, idx_t> mu)
+{
+  static_assert(
+    std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
+    "Data layout not supported");
+  RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu");
+  RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous");
+  RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
+  detail::mean(mu.data_handle(),
+               data.data_handle(),
+               data.extent(1),
+               data.extent(0),
+               std::is_same_v<layout_t, raft::row_major>,
+               resource::get_cuda_stream(handle));
+}
+
+/**
+ * @brief Compute mean of the input matrix
+ *
+ * Mean operation is assumed to be performed on a given column.
+ * Note: This call is deprecated, please use `mean` call without `sample` parameter.
+ *
+ * @tparam value_t the data type
+ * @tparam idx_t index type
+ * @tparam layout_t Layout type of the input matrix.
+ * @param[in]  handle the raft handle
+ * @param[in]  data: the input matrix
+ * @param[out] mu: the output mean vector
+ * @param[in]  sample: whether to evaluate sample mean or not. In other words, whether
+ *   to normalize the output using N-1 or N, for true or false, respectively
+ */
+template <typename value_t, typename idx_t, typename layout_t>
+[[deprecated("'sample' parameter deprecated")]] void mean(
+  raft::resources const& handle,
+  raft::device_matrix_view<const value_t, idx_t, layout_t> data,
+  raft::device_vector_view<value_t, idx_t> mu,
+  bool sample)
 {
   static_assert(
     std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
@@ -96,4 +150,4 @@ void mean(raft::resources const& handle,
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 83f9a8a941..fb9da4dd39 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -163,4 +163,4 @@ void mean_add(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/minmax.cuh b/cpp/include/raft/stats/minmax.cuh
index d2c410dab1..930a6f8b9e 100644
--- a/cpp/include/raft/stats/minmax.cuh
+++ b/cpp/include/raft/stats/minmax.cuh
@@ -141,4 +141,4 @@ void minmax(raft::resources const& handle,
 
 };  // namespace stats
 };  // namespace raft
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/mutual_info_score.cuh b/cpp/include/raft/stats/mutual_info_score.cuh
index 5a334e9280..c895a911e9 100644
--- a/cpp/include/raft/stats/mutual_info_score.cuh
+++ b/cpp/include/raft/stats/mutual_info_score.cuh
@@ -89,4 +89,4 @@ double mutual_info_score(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/r2_score.cuh b/cpp/include/raft/stats/r2_score.cuh
index c98b4bc93a..4ff9f491d8 100644
--- a/cpp/include/raft/stats/r2_score.cuh
+++ b/cpp/include/raft/stats/r2_score.cuh
@@ -90,4 +90,4 @@ value_t r2_score(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh
index a21a0c0dc5..1230d615eb 100644
--- a/cpp/include/raft/stats/rand_index.cuh
+++ b/cpp/include/raft/stats/rand_index.cuh
@@ -75,4 +75,4 @@ double rand_index(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/regression_metrics.cuh b/cpp/include/raft/stats/regression_metrics.cuh
index 718170f716..74763de2fc 100644
--- a/cpp/include/raft/stats/regression_metrics.cuh
+++ b/cpp/include/raft/stats/regression_metrics.cuh
@@ -104,4 +104,4 @@ void regression_metrics(raft::resources const& handle,
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/silhouette_score.cuh b/cpp/include/raft/stats/silhouette_score.cuh
index 23eef84604..15d86969af 100644
--- a/cpp/include/raft/stats/silhouette_score.cuh
+++ b/cpp/include/raft/stats/silhouette_score.cuh
@@ -223,4 +223,4 @@ value_t silhouette_score_batched(
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 0a67bd2325..62668b3ddd 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -185,4 +185,4 @@ void vars(raft::resources const& handle,
 };  // namespace stats
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 2c3ed1b83e..6c18a21988 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -88,4 +88,4 @@ void sum(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/trustworthiness_score.cuh b/cpp/include/raft/stats/trustworthiness_score.cuh
index 3f4464f4d3..2435cb4ef9 100644
--- a/cpp/include/raft/stats/trustworthiness_score.cuh
+++ b/cpp/include/raft/stats/trustworthiness_score.cuh
@@ -98,4 +98,4 @@ double trustworthiness_score(
 }  // namespace stats
 }  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/v_measure.cuh b/cpp/include/raft/stats/v_measure.cuh
index 041adb5e38..1df3eab460 100644
--- a/cpp/include/raft/stats/v_measure.cuh
+++ b/cpp/include/raft/stats/v_measure.cuh
@@ -95,4 +95,4 @@ double v_measure(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/stats/weighted_mean.cuh b/cpp/include/raft/stats/weighted_mean.cuh
index da22f0163c..a3e38f7168 100644
--- a/cpp/include/raft/stats/weighted_mean.cuh
+++ b/cpp/include/raft/stats/weighted_mean.cuh
@@ -189,4 +189,4 @@ void col_weighted_mean(raft::resources const& handle,
 };  // end namespace stats
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml b/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
index a5411082af..4357c207a1 100644
--- a/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
+++ b/cpp/include/raft/thirdparty/mdspan/.github/workflows/cmake.yml
@@ -37,27 +37,27 @@ jobs:
 
     - name: Create Build Environment
       run: cmake -E make_directory ${{github.workspace}}/mdspan-build
-      
+
     - name: Check Out
       uses: actions/checkout@v2
       with:
         path: ${{github.workspace}}/mdspan-src
-      
+
     - name: Configure CMake
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
       run: CXX=${{ matrix.compiler_prefix}}/${{ matrix.compiler_driver }} cmake $GITHUB_WORKSPACE/mdspan-src -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/mdspan-install -DMDSPAN_ENABLE_TESTS=ON -DMDSPAN_ENABLE_EXAMPLES=ON
-      
+
     - name: Build
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
       run: make -j
-      
+
     - name: Test
       working-directory: ${{github.workspace}}/mdspan-build
       shell: bash
       run: ctest
-            
+
     - name: Install
       shell: bash
       working-directory: ${{github.workspace}}/mdspan-build
diff --git a/cpp/include/raft/thirdparty/mdspan/LICENSE b/cpp/include/raft/thirdparty/mdspan/LICENSE
index c68a8a2a9f..db92c208da 100644
--- a/cpp/include/raft/thirdparty/mdspan/LICENSE
+++ b/cpp/include/raft/thirdparty/mdspan/LICENSE
@@ -1,14 +1,14 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
 //
 // Kokkos is licensed under 3-clause BSD terms of use:
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -37,6 +37,6 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
diff --git a/cpp/include/raft/thirdparty/mdspan/README.md b/cpp/include/raft/thirdparty/mdspan/README.md
index a062777261..15af4dd4a9 100644
--- a/cpp/include/raft/thirdparty/mdspan/README.md
+++ b/cpp/include/raft/thirdparty/mdspan/README.md
@@ -70,4 +70,3 @@ Acknowledgements
 ================
 
 This work was undertaken as part of the [Kokkos project](https://github.com/kokkos/kokkos) at Sandia National Laboratories.  Sandia National Laboratories is a multimission laboratory managed and operated by National Technology & Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International Inc., for the U. S. Department of Energy's National Nuclear Security Administration under contract DE-NA0003525.
-
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
index 30391b3d70..3d5cbb955a 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/cuda/CMakeLists.txt
@@ -2,4 +2,4 @@
 mdspan_add_cuda_benchmark(sum_3d_cuda)
 target_include_directories(sum_3d_cuda PUBLIC
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/benchmarks/sum>
-)
\ No newline at end of file
+)
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
index 566c47c9ab..ccab58bfa1 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/CMakeLists.txt
@@ -4,4 +4,4 @@ if(OpenMP_CXX_FOUND)
   target_include_directories(sum_3d_openmp PUBLIC
       $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/benchmarks/sum>
   )
-endif()
\ No newline at end of file
+endif()
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
index 9ab6a0ddf4..ef75349925 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/openmp/sum_3d_openmp.cpp
@@ -174,4 +174,3 @@ BENCHMARK_CAPTURE(
 //================================================================================
 
 BENCHMARK_MAIN();
-
diff --git a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
index f106e2f5ff..4cbfe029c7 100644
--- a/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/benchmarks/sum/sum_submdspan_right.cpp
@@ -223,4 +223,3 @@ BENCHMARK_CAPTURE(
 //================================================================================
 
 BENCHMARK_MAIN();
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
index ea2bad164c..ef45c9d18f 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_compressed_pair_layout.cpp
@@ -169,4 +169,3 @@ test<CP<CP<int*, int*>, CP<int*, int*>>, 4 * sizeof(int*), non_empty>();
 // </editor-fold> end compressed pair layout: 2 nested pairs, 4 leaf elements }}}1
 //==============================================================================
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
index 00126691aa..64d71d650c 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_extents_ctors.cpp
@@ -176,4 +176,3 @@ MDSPAN_STATIC_TEST(
     stdex::extents<size_t,stdex::dynamic_extent, stdex::dynamic_extent, stdex::dynamic_extent>
   >::value
 );
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
index e293734444..fc30fa25e5 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_layout_convertible.cpp
@@ -117,5 +117,3 @@ MDSPAN_STATIC_TEST(
 MDSPAN_STATIC_TEST(
   !std::is_constructible<LS1, NotARealLayout::mapping<E2>>::value
 );
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
index fa1136b9d6..c64fcdbabd 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_mdspan_convertible.cpp
@@ -68,4 +68,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
index 9f7c6c052d..c44b02bf76 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_no_unique_address.cpp
@@ -109,5 +109,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end layouts }}}1
 //==============================================================================
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
index d8edf31ab2..6e41433d6a 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_standard_layout.cpp
@@ -216,6 +216,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
index 73ab426afa..f6457234d7 100644
--- a/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/compilation_tests/ctest_trivially_copyable.cpp
@@ -212,6 +212,3 @@ MDSPAN_STATIC_TEST(
 
 // </editor-fold> end mdspan }}}1
 //==============================================================================
-
-
-
diff --git a/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp b/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
index b8740d5227..ba481c3144 100644
--- a/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/examples/tiled_layout/simple_tiled_layout.cpp
@@ -207,4 +207,3 @@ int main() {
     std::cout << "Success! SimpleTiledLayout2D works as expected." << std::endl;
   }
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
index 67356785c0..02e386e3aa 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp
@@ -42,7 +42,7 @@
 */
 
 
-// NOTE: This code is prematurely taken from an example based on 
+// NOTE: This code is prematurely taken from an example based on
 // https://github.com/kokkos/mdspan/pull/176
 
 #pragma once
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
index 6be71b432c..3b4d69d63e 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/extents.hpp
@@ -531,7 +531,7 @@ struct __extents_to_partially_static_sizes;
 template <class IndexType, size_t... ExtentsPack>
 struct __extents_to_partially_static_sizes<::std::experimental::extents<IndexType, ExtentsPack...>> {
   using type = detail::__partially_static_sizes<
-          typename ::std::experimental::extents<IndexType, ExtentsPack...>::index_type, size_t, 
+          typename ::std::experimental::extents<IndexType, ExtentsPack...>::index_type, size_t,
           ExtentsPack...>;
 };
 
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
index ed1478dc8b..92a291e915 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_left.hpp
@@ -237,4 +237,3 @@ class layout_left::mapping {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
index cd9c9c19bf..c761146874 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp
@@ -45,7 +45,7 @@
 // NOTE: This code is prematurely taken from https://github.com/kokkos/mdspan/pull/180
 // and matches requirements described in https://github.com/ORNL/cpp-proposals-pub/pull/296
 // Some parts (as submdspan integration) are missing
-// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a 
+// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a
 // fixed stride to a padding alignment, allowing dimensions > padding_stride to be padded
 // to multiples of 'padding_stride'
 
@@ -140,7 +140,7 @@ namespace details {
 // layout_padded_left implementation
 
 namespace details {
-   
+
 
   // The *_helper functions work around not having C++20
   // templated lambdas: []<size_t... TrailingIndices>{} .
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
index a9b64ca36a..d4b71efae1 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_right.hpp
@@ -237,4 +237,3 @@ class layout_right::mapping {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
index 904dd40a75..90b1a46288 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/no_unique_address.hpp
@@ -74,13 +74,13 @@ struct __no_unique_address_emulation<
                 // If the type isn't trivially destructible, its destructor
                 // won't be called at the right time, so don't use this
                 // specialization
-                _MDSPAN_TRAIT(is_trivially_destructible, _T)>> : 
+                _MDSPAN_TRAIT(is_trivially_destructible, _T)>> :
 #ifdef _MDSPAN_COMPILER_MSVC
     // MSVC doesn't allow you to access public static member functions of a type
     // when you *happen* to privately inherit from that type.
     protected
 #else
-    // But we still want this to be private if possible so that we don't accidentally 
+    // But we still want this to be private if possible so that we don't accidentally
     // access members of _T directly rather than calling __ref() first, which wouldn't
     // work if _T happens to be stateful and thus we're using the unspecialized definition
     // of __no_unique_address_emulation above.
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
index 7de72e6537..64845190ae 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/type_list.hpp
@@ -114,4 +114,3 @@ struct __type_at<3, __type_list<_T0, _T1, _T2, _T3, _Ts...>> {
 
 } // end namespace experimental
 } // end namespace std
-
diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
index fa710a59b6..60e06dd68e 100644
--- a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
+++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdarray
@@ -45,4 +45,3 @@
 
 #include "mdspan"
 #include "__p1684_bits/mdarray.hpp"
-
diff --git a/cpp/include/raft/thirdparty/mdspan/make_single_header.py b/cpp/include/raft/thirdparty/mdspan/make_single_header.py
index 1b562c7176..98ab3526db 100755
--- a/cpp/include/raft/thirdparty/mdspan/make_single_header.py
+++ b/cpp/include/raft/thirdparty/mdspan/make_single_header.py
@@ -49,4 +49,3 @@ def process_file(file_path, out_lines=[], front_matter_lines=[], back_matter_lin
        "#define _MDSPAN_SINGLE_HEADER_INCLUDE_GUARD_\n"],
       ["#endif // _MDSPAN_SINGLE_HEADER_INCLUDE_GUARD_\n"],
       [abspath(sys.argv[1])]))
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt b/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
index d92834beb7..a30ce2c198 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
+++ b/cpp/include/raft/thirdparty/mdspan/tests/CMakeLists.txt
@@ -57,4 +57,3 @@ mdspan_add_test(test_layout_ctors)
 mdspan_add_test(test_layout_stride)
 mdspan_add_test(test_submdspan)
 mdspan_add_test(test_mdarray_ctors)
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
index f09b799684..e91896c1c4 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_exhaustive_layouts.cpp
@@ -424,4 +424,3 @@ TYPED_TEST(TestLayoutConversion, implicit_conversion) {
     ASSERT_EQ(map1.stride(r), map2.stride(r));
   }
 }
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
index 3a3e1c2696..12008f05cf 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_layout_stride.cpp
@@ -164,4 +164,3 @@ TEST(TestLayoutStrideCTAD, test_ctad) {
 */
 }
 #endif
-
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
index 781a12a697..3dcb61d454 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_mdarray_ctors.cpp
@@ -740,7 +740,7 @@ TEST(TestMdarrayCTAD, layout_stride) {
   ASSERT_EQ(m0.stride(1), 128);
   ASSERT_FALSE(m0.is_exhaustive());
 
-  /* 
+  /*
   stdex::mdarray m1{d.data(), stdex::layout_stride::mapping{stdex::extents{16, 32}, stdex::extents{1, 128}}};
   ASSERT_EQ(m1.data(), d.data());
   ASSERT_EQ(m1.rank(), 2);
diff --git a/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp b/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
index 81d3fdb983..14ae51a259 100644
--- a/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
+++ b/cpp/include/raft/thirdparty/mdspan/tests/test_mdspan_ctors.cpp
@@ -346,7 +346,7 @@ TEST(TestMdspanCTAD, layout_stride) {
   ASSERT_EQ(m0.stride(1), 128);
   ASSERT_FALSE(m0.is_exhaustive());
 
-  /* 
+  /*
   stdex::mdspan m1{d.data(), stdex::layout_stride::mapping{stdex::extents{16, 32}, stdex::extents{1, 128}}};
   ASSERT_EQ(m1.data(), d.data());
   ASSERT_EQ(m1.rank(), 2);
diff --git a/cpp/include/raft/util/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh
index f335be6fd0..9638a261a5 100644
--- a/cpp/include/raft/util/detail/popc.cuh
+++ b/cpp/include/raft/util/detail/popc.cuh
@@ -73,4 +73,4 @@ void popc(const raft::resources& res,
     });
 }
 
-}  // end namespace raft::detail
\ No newline at end of file
+}  // end namespace raft::detail
diff --git a/cpp/include/raft/util/device_loads_stores.cuh b/cpp/include/raft/util/device_loads_stores.cuh
index 2c954ec99a..c1b668fed6 100644
--- a/cpp/include/raft/util/device_loads_stores.cuh
+++ b/cpp/include/raft/util/device_loads_stores.cuh
@@ -739,4 +739,46 @@ DI void block_copy(raft::device_span<T> dst, const raft::device_span<T> src)
 
 /** @} */
 
+/**
+ * @defgroup GlobalStores Global Store Operations
+ * @{
+ * @brief Perform conditional stores to global memory.
+ *
+ * These functions store data to a specified global memory address,
+ * controlled by a guard flag to enable conditional execution.
+ *
+ * @param[in] reg   The data to store in global memory.
+ *                  The type of `reg` determines the size of the store.
+ * @param[in] addr  The global memory address where the data will be stored.
+ * @param[in] guard A flag to conditionally enable the store operation.
+ *                  If `true`, the store is performed; otherwise, it is skipped.
+ */
+DI void stg(const int& reg, void* addr, bool guard)
+{
+  asm volatile(
+    "{\n"
+    ".reg .pred p;\n"               // predicate register, local to this { } block
+    "setp.ne.b32 p, %2, 0;\n"       // p = (guard != 0)
+    "@p st.global.b32 [%0], %1;\n"  // if p: 32-bit store of reg to global address addr
+    "}\n"
+    :
+    : "l"(addr), "r"(reg), "r"((int)guard)  // %0 = addr, %1 = reg, %2 = guard cast to int
+    : "memory");
+}
+
+DI void stg(const int64_t& reg, void* addr, bool guard)
+{
+  asm volatile(
+    "{\n"
+    ".reg .pred p;\n"               // predicate register, local to this { } block
+    "setp.ne.b32 p, %2, 0;\n"       // p = (guard != 0)
+    "@p st.global.b64 [%0], %1;\n"  // if p: 64-bit store of reg to global address addr
+    "}\n"
+    :
+    : "l"(addr), "l"(reg), "r"((int)guard)  // %0 = addr, %1 = reg, %2 = guard cast to int
+    : "memory");
+}
+
+/** @} */
+
 }  // namespace raft
diff --git a/cpp/include/raft/util/input_validation.hpp b/cpp/include/raft/util/input_validation.hpp
index 17bb53f22b..119fd9d2e2 100644
--- a/cpp/include/raft/util/input_validation.hpp
+++ b/cpp/include/raft/util/input_validation.hpp
@@ -129,4 +129,4 @@ constexpr bool is_scalar_view(mdspan<ElementType, Extents> m)
   return false;
 }
 
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/include/raft/util/integer_utils.hpp b/cpp/include/raft/util/integer_utils.hpp
index 5224d5ac4c..7ea5a3d212 100644
--- a/cpp/include/raft/util/integer_utils.hpp
+++ b/cpp/include/raft/util/integer_utils.hpp
@@ -25,6 +25,7 @@
 
 #include <raft/core/detail/macros.hpp>
 
+#include <cstdint>
 #include <limits>
 #include <stdexcept>
 #include <type_traits>
diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index 493ac9befe..a31d9f79df 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -36,7 +36,7 @@ namespace raft::util::itertools {
  *              fields of the structure (if the structure has more fields, some might be initialized
  *              with their default value).
  * @param lists One or more initializer lists.
- * @return std::vector<S> A vector of structures containing the cartesian product.
+ * @return `std::vector<S>` A vector of structures containing the cartesian product.
  */
 template <typename S, typename... Args>
 std::vector<S> product(std::initializer_list<Args>... lists)
diff --git a/cpp/include/raft/util/warp_primitives.cuh b/cpp/include/raft/util/warp_primitives.cuh
index 953c137cdf..2a7c4e9127 100644
--- a/cpp/include/raft/util/warp_primitives.cuh
+++ b/cpp/include/raft/util/warp_primitives.cuh
@@ -256,4 +256,4 @@ DI std::enable_if_t<!is_shuffleable_v<T>, T> shfl_xor(T val,
   return output;
 }
 
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/scripts/run-clang-compile.py b/cpp/scripts/run-clang-compile.py
index 123f0e4075..8ed9aa00f0 100644
--- a/cpp/scripts/run-clang-compile.py
+++ b/cpp/scripts/run-clang-compile.py
@@ -253,12 +253,12 @@ def run_clang_command(clang_cmd, cwd):
 class LockContext(object):
     def __init__(self, lock=None) -> None:
         self._lock = lock
-    
+
     def __enter__(self):
         if self._lock:
             self._lock.acquire()
         return self
-    
+
     def __exit__(self, _, __, ___):
         if self._lock:
             self._lock.release()
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index 3d8bbcec4a..cad08ca551 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -296,12 +296,12 @@ def run_clang_tidy_command(tidy_cmd, cwd):
 class LockContext(object):
     def __init__(self, lock=None) -> None:
         self._lock = lock
-    
+
     def __enter__(self):
         if self._lock:
             self._lock.acquire()
         return self
-    
+
     def __exit__(self, _, __, ___):
         if self._lock:
             self._lock.release()
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index db5a8b5804..e08481fbd6 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -17,7 +17,7 @@
 # and exits gracefully if the file is not found. If a user wishes to specify a
 # config file at a nonstandard location, they may do so by setting the
 # environment variable RAPIDS_CMAKE_FORMAT_FILE.
-# 
+#
 # This script can be invoked directly anywhere within the project repository.
 # Alternatively, it may be invoked as a pre-commit hook via
 # `pre-commit run (cmake-format)|(cmake-lint)`.
diff --git a/cpp/src/core/logger.cpp b/cpp/src/core/logger.cpp
deleted file mode 100644
index 8f81cf2926..0000000000
--- a/cpp/src/core/logger.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <raft/core/logger-inl.hpp>
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
deleted file mode 100644
index b44d87d1bd..0000000000
--- a/cpp/test/stats/mean_center.cu
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../linalg/matrix_vector_op.cuh"
-#include "../test_utils.cuh"
-
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/random/rng.cuh>
-#include <raft/stats/mean.cuh>
-#include <raft/stats/mean_center.cuh>
-#include <raft/util/cudart_utils.hpp>
-
-#include <gtest/gtest.h>
-
-namespace raft {
-namespace stats {
-
-template <typename T, typename IdxType>
-struct MeanCenterInputs {
-  T tolerance, mean;
-  IdxType rows, cols;
-  bool sample, rowMajor, bcastAlongRows;
-  unsigned long long int seed;
-};
-
-template <typename T, typename IdxType>
-::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs<T, IdxType>& dims)
-{
-  return os;
-}
-
-template <typename T, typename IdxType>
-class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
- public:
-  MeanCenterTest()
-    : params(::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam()),
-      stream(resource::get_cuda_stream(handle)),
-      rows(params.rows),
-      cols(params.cols),
-      out(rows * cols, stream),
-      out_ref(rows * cols, stream),
-      data(rows * cols, stream),
-      meanVec(params.bcastAlongRows ? cols : rows, stream)
-  {
-  }
-
- protected:
-  void SetUp() override
-  {
-    raft::random::RngState r(params.seed);
-    auto len         = rows * cols;
-    auto meanVecSize = params.bcastAlongRows ? cols : rows;
-    normal(handle, r, data.data(), len, params.mean, (T)1.0);
-    raft::stats::mean(
-      meanVec.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream);
-    if (params.rowMajor) {
-      using layout = raft::row_major;
-      mean_center(handle,
-                  raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
-                  raft::make_device_vector_view<const T, int>(meanVec.data(), meanVecSize),
-                  raft::make_device_matrix_view<T, int, layout>(out.data(), rows, cols),
-                  params.bcastAlongRows);
-    } else {
-      using layout = raft::col_major;
-      mean_center(handle,
-                  raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
-                  raft::make_device_vector_view<const T, int>(meanVec.data(), meanVecSize),
-                  raft::make_device_matrix_view<T, int, layout>(out.data(), rows, cols),
-                  params.bcastAlongRows);
-    }
-    raft::linalg::naiveMatVec(out_ref.data(),
-                              data.data(),
-                              meanVec.data(),
-                              cols,
-                              rows,
-                              params.rowMajor,
-                              params.bcastAlongRows,
-                              (T)-1.0,
-                              stream);
-    resource::sync_stream(handle, stream);
-  }
-
- protected:
-  raft::resources handle;
-  cudaStream_t stream;
-
-  MeanCenterInputs<T, IdxType> params;
-  int rows, cols;
-  rmm::device_uvector<T> data, meanVec, out, out_ref;
-};
-
-const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
-typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
-TEST_P(MeanCenterTestF_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    out.data(), out_ref.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
-
-const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
-typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
-TEST_P(MeanCenterTestF_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    out.data(), out_ref.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
-
-const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
-typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
-TEST_P(MeanCenterTestD_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    out.data(), out_ref.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
-
-const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
-typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
-TEST_P(MeanCenterTestD_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    out.data(), out_ref.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
-}
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64));
-
-}  // end namespace stats
-}  // end namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/tests/CMakeLists.txt
similarity index 97%
rename from cpp/test/CMakeLists.txt
rename to cpp/tests/CMakeLists.txt
index 621ee6c160..9f96b93e7a 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -55,6 +55,7 @@ function(ConfigureTest)
             ${RAFT_CTK_MATH_DEPENDENCIES}
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
+            raft_test_logger
   )
   set_target_properties(
     ${TEST_NAME}
@@ -76,7 +77,7 @@ function(ConfigureTest)
     target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA")
   endif()
 
-  target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
+  target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/tests>")
 
   rapids_test_add(
     NAME ${TEST_NAME}
@@ -87,6 +88,10 @@ function(ConfigureTest)
   )
 endfunction()
 
+# Create an object library for the logger so that it is compiled once and reused by every test.
+add_library(raft_test_logger OBJECT)
+target_link_libraries(raft_test_logger PRIVATE raft_logger_impl)
+
 # ##################################################################################################
 # test sources ##################################################################################
 # ##################################################################################################
diff --git a/cpp/test/core/bitmap.cu b/cpp/tests/core/bitmap.cu
similarity index 100%
rename from cpp/test/core/bitmap.cu
rename to cpp/tests/core/bitmap.cu
diff --git a/cpp/test/core/bitset.cu b/cpp/tests/core/bitset.cu
similarity index 72%
rename from cpp/test/core/bitset.cu
rename to cpp/tests/core/bitset.cu
index ac601274c1..f094f60ded 100644
--- a/cpp/test/core/bitset.cu
+++ b/cpp/tests/core/bitset.cu
@@ -24,6 +24,8 @@
 #include <gtest/gtest.h>
 
 #include <algorithm>
+#include <cstdint>
+#include <cstdlib>
 #include <numeric>
 
 namespace raft::core {
@@ -73,6 +75,40 @@ void test_cpu_bitset(const std::vector<bitset_t>& bitset,
   }
 }
 
+template <typename bitset_t, typename index_t>
+void test_cpu_bitset_nbits(const bitset_t* bitset,
+                           const std::vector<index_t>& queries,
+                           std::vector<uint8_t>& result,
+                           unsigned original_nbits_)
+{  // result[i] = 1 iff bit queries[i] (indexed in original_nbits_-bit words) is set in bitset
+  constexpr size_t nbits = sizeof(bitset_t) * 8;  // width in bits of one bitset_t word
+  if (original_nbits_ == nbits) {
+    for (size_t i = 0; i < queries.size(); i++) {
+      result[i] =
+        uint8_t((bitset[queries[i] / nbits] & (bitset_t{1} << (queries[i] % nbits))) != 0);
+    }
+  }  // NOTE(review): no early return; the loop below runs as well and rewrites result[]
+  for (size_t i = 0; i < queries.size(); i++) {
+    const index_t sample_index        = queries[i];
+    const index_t original_bit_index  = sample_index / original_nbits_;
+    const index_t original_bit_offset = sample_index % original_nbits_;
+    index_t new_bit_index             = original_bit_index * original_nbits_ / nbits;
+    index_t new_bit_offset            = 0;
+    if (original_nbits_ > nbits) {  // one original word spans several bitset_t words
+      new_bit_index += original_bit_offset / nbits;
+      new_bit_offset = original_bit_offset % nbits;
+    } else {  // original_nbits_ <= nbits: `ratio` original words fit in one bitset_t word
+      index_t ratio = nbits / original_nbits_;
+      new_bit_offset += (original_bit_index % ratio) * original_nbits_;
+      new_bit_offset += original_bit_offset % nbits;
+    }
+    const bitset_t bit_element = bitset[new_bit_index];
+    const bool is_bit_set      = (bit_element & (bitset_t{1} << new_bit_offset)) != 0;
+
+    result[i] = uint8_t(is_bit_set);  // 1 if the queried bit is set, 0 otherwise
+  }
+}
+
 template <typename bitset_t>
 void flip_cpu_bitset(std::vector<bitset_t>& bitset)
 {
@@ -168,11 +204,12 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
-    auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
-    auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
-    auto query_cpu     = std::vector<index_t>(spec.query_len);
-    auto result_cpu    = std::vector<uint8_t>(spec.query_len);
-    auto result_ref    = std::vector<uint8_t>(spec.query_len);
+    auto query_device     = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
+    auto result_device    = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
+    auto query_cpu        = std::vector<index_t>(spec.query_len);
+    auto result_cpu       = std::vector<uint8_t>(spec.query_len);
+    auto result_ref_nbits = std::vector<uint8_t>(spec.query_len);
+    auto result_ref       = std::vector<uint8_t>(spec.query_len);
 
     // Create queries and verify the test results
     raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
@@ -194,6 +231,63 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
+    // Reinterpret the bitset as uint8_t, uint32 then uint64_t
+    {
+      // Test CPU logic: the reference computed on the native bitset_t words must match the
+      // result of test_cpu_bitset_nbits reading a byte-wise copy of the same bitset.
+      test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+      // Hold the byte copy in a std::vector so it is released even if the ASSERT_TRUE below
+      // fails and returns from this function early (a malloc/free pair would leak then).
+      std::vector<uint8_t> bitset_cpu_uint8(sizeof(bitset_t) * bitset_ref.size());
+      std::memcpy(bitset_cpu_uint8.data(), bitset_ref.data(), bitset_cpu_uint8.size());
+      test_cpu_bitset_nbits(
+        bitset_cpu_uint8.data(), query_cpu, result_ref_nbits, sizeof(bitset_t) * 8);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_ref_nbits, raft::Compare<uint8_t>()));
+
+      // Test GPU uint8_t, uint32_t, uint64_t
+      // Each view below wraps the same my_bitset storage with a different word type; the last
+      // constructor argument is the bit width of bitset_t (NOTE(review): presumably the width
+      // the bitset was originally written with -- confirm against bitset_view).
+      auto my_bitset_view_uint8_t = raft::core::bitset_view<uint8_t, uint32_t>(
+        reinterpret_cast<uint8_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint8_t] __device__(index_t query) {
+          return my_bitset_view_uint8_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint32_t = raft::core::bitset_view<uint32_t, uint32_t>(
+        reinterpret_cast<uint32_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint32_t] __device__(index_t query) {
+          return my_bitset_view_uint32_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint64_t = raft::core::bitset_view<uint64_t, uint32_t>(
+        reinterpret_cast<uint64_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint64_t] __device__(index_t query) {
+          return my_bitset_view_uint64_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+    }
+
     // test sparsity, repeat and eval_n_elements
     {
       auto my_bitset_view  = my_bitset.view();
diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/tests/core/device_resources_manager.cpp
similarity index 98%
rename from cpp/test/core/device_resources_manager.cpp
rename to cpp/tests/core/device_resources_manager.cpp
index c63d5896e5..007b57378f 100644
--- a/cpp/test/core/device_resources_manager.cpp
+++ b/cpp/tests/core/device_resources_manager.cpp
@@ -89,7 +89,7 @@ TEST(DeviceResourcesManager, ObeysSetters)
 
   // Suppress the many warnings from testing use of setters after initial
   // get_device_resources call
-  auto scoped_log_level = log_level_setter{RAFT_LEVEL_ERROR};
+  auto scoped_log_level = log_level_setter{level_enum::error};
 
   omp_set_dynamic(0);
 #pragma omp parallel for num_threads(5)
diff --git a/cpp/test/core/device_setter.cpp b/cpp/tests/core/device_setter.cpp
similarity index 100%
rename from cpp/test/core/device_setter.cpp
rename to cpp/tests/core/device_setter.cpp
diff --git a/cpp/test/core/handle.cpp b/cpp/tests/core/handle.cpp
similarity index 100%
rename from cpp/test/core/handle.cpp
rename to cpp/tests/core/handle.cpp
diff --git a/cpp/test/core/interruptible.cu b/cpp/tests/core/interruptible.cu
similarity index 100%
rename from cpp/test/core/interruptible.cu
rename to cpp/tests/core/interruptible.cu
diff --git a/cpp/test/core/logger.cpp b/cpp/tests/core/logger.cpp
similarity index 52%
rename from cpp/test/core/logger.cpp
rename to cpp/tests/core/logger.cpp
index 7f31beed71..10adb71dda 100644
--- a/cpp/test/core/logger.cpp
+++ b/cpp/tests/core/logger.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-// We set RAFT_ACTIVE_LEVEL to a value that would enable testing trace and debug logs
+// We set RAFT_LOG_ACTIVE_LEVEL to a value that would enable testing trace and debug logs
-// (otherwise trace and debug logs are desabled by default).
+// (otherwise trace and debug logs are disabled by default).
-#undef RAFT_ACTIVE_LEVEL
-#define RAFT_ACTIVE_LEVEL 6
+#undef RAFT_LOG_ACTIVE_LEVEL
+#define RAFT_LOG_ACTIVE_LEVEL RAFT_LOG_LEVEL_TRACE
 
 #include <raft/core/logger.hpp>
 
@@ -34,15 +34,15 @@ TEST(logger, Test)
   RAFT_LOG_WARN("This is a warning message");
   RAFT_LOG_INFO("This is an info message");
 
-  logger::get(RAFT_NAME).set_level(RAFT_LEVEL_WARN);
-  ASSERT_EQ(RAFT_LEVEL_WARN, logger::get(RAFT_NAME).get_level());
-  logger::get(RAFT_NAME).set_level(RAFT_LEVEL_INFO);
-  ASSERT_EQ(RAFT_LEVEL_INFO, logger::get(RAFT_NAME).get_level());
+  default_logger().set_level(raft::level_enum::warn);
+  ASSERT_EQ(raft::level_enum::warn, default_logger().level());
+  default_logger().set_level(raft::level_enum::info);
+  ASSERT_EQ(raft::level_enum::info, default_logger().level());
 
-  ASSERT_FALSE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_TRACE));
-  ASSERT_FALSE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_DEBUG));
-  ASSERT_TRUE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_INFO));
-  ASSERT_TRUE(logger::get(RAFT_NAME).should_log_for(RAFT_LEVEL_WARN));
+  ASSERT_FALSE(default_logger().should_log(raft::level_enum::trace));
+  ASSERT_FALSE(default_logger().should_log(raft::level_enum::debug));
+  ASSERT_TRUE(default_logger().should_log(raft::level_enum::info));
+  ASSERT_TRUE(default_logger().should_log(raft::level_enum::warn));
 }
 
 std::string logged = "";
@@ -57,60 +57,61 @@ class loggerTest : public ::testing::Test {
   {
     flushCount = 0;
     logged     = "";
-    logger::get(RAFT_NAME).set_level(RAFT_LEVEL_TRACE);
+    default_logger().set_level(raft::level_enum::trace);
   }
 
   void TearDown() override
   {
-    logger::get(RAFT_NAME).set_callback(nullptr);
-    logger::get(RAFT_NAME).set_flush(nullptr);
-    logger::get(RAFT_NAME).set_level(RAFT_LEVEL_INFO);
+    default_logger().sinks().pop_back();
+    default_logger().set_level(raft::level_enum::info);
   }
 };
 
-// The logging macros depend on `RAFT_ACTIVE_LEVEL` as well as the logger verbosity;
-// The verbosity is set to `RAFT_LEVEL_TRACE`, but `RAFT_ACTIVE_LEVEL` is set outside of here.
-auto check_if_logged(const std::string& msg, int log_level_def) -> bool
+// The logging macros depend on `RAFT_LOG_ACTIVE_LEVEL` as well as the logger verbosity;
+// The verbosity is set to `RAFT_LOG_LEVEL_TRACE`, but `RAFT_LOG_ACTIVE_LEVEL` is set outside of
+// here.
+auto check_if_logged(const std::string& msg, raft::level_enum log_level_def) -> bool
 {
   bool actually_logged  = logged.find(msg) != std::string::npos;
-  bool should_be_logged = RAFT_ACTIVE_LEVEL >= log_level_def;
+  bool should_be_logged = RAFT_LOG_ACTIVE_LEVEL <= static_cast<int>(log_level_def);
   return actually_logged == should_be_logged;
 }
 
 TEST_F(loggerTest, callback)
 {
   std::string testMsg;
-  logger::get(RAFT_NAME).set_callback(exampleCallback);
+  default_logger().sinks().push_back(std::make_shared<callback_sink_mt>(exampleCallback));
 
   testMsg = "This is a critical message";
   RAFT_LOG_CRITICAL(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_CRITICAL));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::critical));
 
   testMsg = "This is an error message";
   RAFT_LOG_ERROR(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_ERROR));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::error));
 
   testMsg = "This is a warning message";
   RAFT_LOG_WARN(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_WARN));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::warn));
 
   testMsg = "This is an info message";
   RAFT_LOG_INFO(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_INFO));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::info));
 
   testMsg = "This is a debug message";
   RAFT_LOG_DEBUG(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_DEBUG));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::debug));
 
   testMsg = "This is a trace message";
   RAFT_LOG_TRACE(testMsg.c_str());
-  ASSERT_TRUE(check_if_logged(testMsg, RAFT_LEVEL_TRACE));
+  ASSERT_TRUE(check_if_logged(testMsg, raft::level_enum::trace));
 }
 
 TEST_F(loggerTest, flush)
 {
-  logger::get(RAFT_NAME).set_flush(exampleFlush);
-  logger::get(RAFT_NAME).flush();
+  default_logger().sinks().push_back(
+    std::make_shared<callback_sink_mt>(exampleCallback, exampleFlush));
+  default_logger().flush();
   ASSERT_EQ(1, flushCount);
 }
 
diff --git a/cpp/test/core/math_device.cu b/cpp/tests/core/math_device.cu
similarity index 100%
rename from cpp/test/core/math_device.cu
rename to cpp/tests/core/math_device.cu
diff --git a/cpp/test/core/math_host.cpp b/cpp/tests/core/math_host.cpp
similarity index 100%
rename from cpp/test/core/math_host.cpp
rename to cpp/tests/core/math_host.cpp
diff --git a/cpp/test/core/mdarray.cu b/cpp/tests/core/mdarray.cu
similarity index 100%
rename from cpp/test/core/mdarray.cu
rename to cpp/tests/core/mdarray.cu
diff --git a/cpp/test/core/mdbuffer.cu b/cpp/tests/core/mdbuffer.cu
similarity index 100%
rename from cpp/test/core/mdbuffer.cu
rename to cpp/tests/core/mdbuffer.cu
diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/tests/core/mdspan_copy.cpp
similarity index 100%
rename from cpp/test/core/mdspan_copy.cpp
rename to cpp/tests/core/mdspan_copy.cpp
diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/tests/core/mdspan_copy.cu
similarity index 100%
rename from cpp/test/core/mdspan_copy.cu
rename to cpp/tests/core/mdspan_copy.cu
diff --git a/cpp/test/core/mdspan_utils.cu b/cpp/tests/core/mdspan_utils.cu
similarity index 100%
rename from cpp/test/core/mdspan_utils.cu
rename to cpp/tests/core/mdspan_utils.cu
diff --git a/cpp/test/core/memory_type.cpp b/cpp/tests/core/memory_type.cpp
similarity index 100%
rename from cpp/test/core/memory_type.cpp
rename to cpp/tests/core/memory_type.cpp
diff --git a/cpp/test/core/numpy_serializer.cu b/cpp/tests/core/numpy_serializer.cu
similarity index 100%
rename from cpp/test/core/numpy_serializer.cu
rename to cpp/tests/core/numpy_serializer.cu
diff --git a/cpp/test/core/nvtx.cpp b/cpp/tests/core/nvtx.cpp
similarity index 100%
rename from cpp/test/core/nvtx.cpp
rename to cpp/tests/core/nvtx.cpp
diff --git a/cpp/test/core/operators_device.cu b/cpp/tests/core/operators_device.cu
similarity index 100%
rename from cpp/test/core/operators_device.cu
rename to cpp/tests/core/operators_device.cu
diff --git a/cpp/test/core/operators_host.cpp b/cpp/tests/core/operators_host.cpp
similarity index 100%
rename from cpp/test/core/operators_host.cpp
rename to cpp/tests/core/operators_host.cpp
diff --git a/cpp/test/core/seive.cu b/cpp/tests/core/seive.cu
similarity index 100%
rename from cpp/test/core/seive.cu
rename to cpp/tests/core/seive.cu
diff --git a/cpp/test/core/span.cpp b/cpp/tests/core/span.cpp
similarity index 100%
rename from cpp/test/core/span.cpp
rename to cpp/tests/core/span.cpp
diff --git a/cpp/test/core/span.cu b/cpp/tests/core/span.cu
similarity index 100%
rename from cpp/test/core/span.cu
rename to cpp/tests/core/span.cu
diff --git a/cpp/test/core/sparse_matrix.cpp b/cpp/tests/core/sparse_matrix.cpp
similarity index 100%
rename from cpp/test/core/sparse_matrix.cpp
rename to cpp/tests/core/sparse_matrix.cpp
diff --git a/cpp/test/core/sparse_matrix.cu b/cpp/tests/core/sparse_matrix.cu
similarity index 100%
rename from cpp/test/core/sparse_matrix.cu
rename to cpp/tests/core/sparse_matrix.cu
diff --git a/cpp/test/core/stream_view.cpp b/cpp/tests/core/stream_view.cpp
similarity index 100%
rename from cpp/test/core/stream_view.cpp
rename to cpp/tests/core/stream_view.cpp
diff --git a/cpp/test/core/temporary_device_buffer.cu b/cpp/tests/core/temporary_device_buffer.cu
similarity index 100%
rename from cpp/test/core/temporary_device_buffer.cu
rename to cpp/tests/core/temporary_device_buffer.cu
diff --git a/cpp/test/core/test_span.hpp b/cpp/tests/core/test_span.hpp
similarity index 100%
rename from cpp/test/core/test_span.hpp
rename to cpp/tests/core/test_span.hpp
diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/tests/ext_headers/00_generate.py
similarity index 100%
rename from cpp/test/ext_headers/00_generate.py
rename to cpp/tests/ext_headers/00_generate.py
diff --git a/cpp/test/ext_headers/raft_core_logger.cpp b/cpp/tests/ext_headers/raft_core_logger.cpp
similarity index 100%
rename from cpp/test/ext_headers/raft_core_logger.cpp
rename to cpp/tests/ext_headers/raft_core_logger.cpp
diff --git a/cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu b/cpp/tests/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
rename to cpp/tests/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
diff --git a/cpp/test/ext_headers/raft_distance_distance.cu b/cpp/tests/ext_headers/raft_distance_distance.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_distance.cu
rename to cpp/tests/ext_headers/raft_distance_distance.cu
diff --git a/cpp/test/ext_headers/raft_distance_fused_l2_nn.cu b/cpp/tests/ext_headers/raft_distance_fused_l2_nn.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_distance_fused_l2_nn.cu
rename to cpp/tests/ext_headers/raft_distance_fused_l2_nn.cu
diff --git a/cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu b/cpp/tests/ext_headers/raft_linalg_detail_coalesced_reduction.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
rename to cpp/tests/ext_headers/raft_linalg_detail_coalesced_reduction.cu
diff --git a/cpp/test/ext_headers/raft_matrix_detail_select_k.cu b/cpp/tests/ext_headers/raft_matrix_detail_select_k.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_matrix_detail_select_k.cu
rename to cpp/tests/ext_headers/raft_matrix_detail_select_k.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ball_cover.cu b/cpp/tests/ext_headers/raft_neighbors_ball_cover.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ball_cover.cu
rename to cpp/tests/ext_headers/raft_neighbors_ball_cover.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_brute_force.cu b/cpp/tests/ext_headers/raft_neighbors_brute_force.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_brute_force.cu
rename to cpp/tests/ext_headers/raft_neighbors_brute_force.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu b/cpp/tests/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
rename to cpp/tests/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_flat.cu b/cpp/tests/ext_headers/raft_neighbors_ivf_flat.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ivf_flat.cu
rename to cpp/tests/ext_headers/raft_neighbors_ivf_flat.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_pq.cu b/cpp/tests/ext_headers/raft_neighbors_ivf_pq.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_ivf_pq.cu
rename to cpp/tests/ext_headers/raft_neighbors_ivf_pq.cu
diff --git a/cpp/test/ext_headers/raft_neighbors_refine.cu b/cpp/tests/ext_headers/raft_neighbors_refine.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_neighbors_refine.cu
rename to cpp/tests/ext_headers/raft_neighbors_refine.cu
diff --git a/cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu b/cpp/tests/ext_headers/raft_sparse_matrix_detail_select_k.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu
rename to cpp/tests/ext_headers/raft_sparse_matrix_detail_select_k.cu
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu b/cpp/tests/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
rename to cpp/tests/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu b/cpp/tests/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
similarity index 100%
rename from cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
rename to cpp/tests/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
diff --git a/cpp/test/label/label.cu b/cpp/tests/label/label.cu
similarity index 98%
rename from cpp/test/label/label.cu
rename to cpp/tests/label/label.cu
index 4c3479182f..34a336de59 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/tests/label/label.cu
@@ -59,8 +59,8 @@ TEST_F(MakeMonotonicTest, Result)
 
   ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare<bool>(), stream));
 
-  delete data_h;
-  delete expected_h;
+  delete[] data_h;
+  delete[] expected_h;
 }
 
 TEST(labelTest, Classlabels)
diff --git a/cpp/test/label/merge_labels.cu b/cpp/tests/label/merge_labels.cu
similarity index 100%
rename from cpp/test/label/merge_labels.cu
rename to cpp/tests/label/merge_labels.cu
diff --git a/cpp/test/lap/lap.cu b/cpp/tests/lap/lap.cu
similarity index 100%
rename from cpp/test/lap/lap.cu
rename to cpp/tests/lap/lap.cu
diff --git a/cpp/test/linalg/add.cu b/cpp/tests/linalg/add.cu
similarity index 100%
rename from cpp/test/linalg/add.cu
rename to cpp/tests/linalg/add.cu
diff --git a/cpp/test/linalg/add.cuh b/cpp/tests/linalg/add.cuh
similarity index 100%
rename from cpp/test/linalg/add.cuh
rename to cpp/tests/linalg/add.cuh
diff --git a/cpp/test/linalg/axpy.cu b/cpp/tests/linalg/axpy.cu
similarity index 100%
rename from cpp/test/linalg/axpy.cu
rename to cpp/tests/linalg/axpy.cu
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/tests/linalg/binary_op.cu
similarity index 100%
rename from cpp/test/linalg/binary_op.cu
rename to cpp/tests/linalg/binary_op.cu
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/tests/linalg/binary_op.cuh
similarity index 100%
rename from cpp/test/linalg/binary_op.cuh
rename to cpp/tests/linalg/binary_op.cuh
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/tests/linalg/cholesky_r1.cu
similarity index 99%
rename from cpp/test/linalg/cholesky_r1.cu
rename to cpp/tests/linalg/cholesky_r1.cu
index f87e07402f..e506c89a79 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/tests/linalg/cholesky_r1.cu
@@ -170,4 +170,4 @@ TYPED_TEST(CholeskyR1Test, update) { this->testR1Update(); }
 TYPED_TEST(CholeskyR1Test, throwError) { this->testR1Error(); }
 
 };  // namespace linalg
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/tests/linalg/coalesced_reduction.cu
similarity index 100%
rename from cpp/test/linalg/coalesced_reduction.cu
rename to cpp/tests/linalg/coalesced_reduction.cu
diff --git a/cpp/test/linalg/divide.cu b/cpp/tests/linalg/divide.cu
similarity index 100%
rename from cpp/test/linalg/divide.cu
rename to cpp/tests/linalg/divide.cu
diff --git a/cpp/test/linalg/dot.cu b/cpp/tests/linalg/dot.cu
similarity index 100%
rename from cpp/test/linalg/dot.cu
rename to cpp/tests/linalg/dot.cu
diff --git a/cpp/test/linalg/eig.cu b/cpp/tests/linalg/eig.cu
similarity index 100%
rename from cpp/test/linalg/eig.cu
rename to cpp/tests/linalg/eig.cu
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/tests/linalg/eig_sel.cu
similarity index 100%
rename from cpp/test/linalg/eig_sel.cu
rename to cpp/tests/linalg/eig_sel.cu
diff --git a/cpp/test/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
similarity index 100%
rename from cpp/test/linalg/eigen_solvers.cu
rename to cpp/tests/linalg/eigen_solvers.cu
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/tests/linalg/eltwise.cu
similarity index 100%
rename from cpp/test/linalg/eltwise.cu
rename to cpp/tests/linalg/eltwise.cu
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/tests/linalg/gemm_layout.cu
similarity index 100%
rename from cpp/test/linalg/gemm_layout.cu
rename to cpp/tests/linalg/gemm_layout.cu
diff --git a/cpp/test/linalg/gemv.cu b/cpp/tests/linalg/gemv.cu
similarity index 100%
rename from cpp/test/linalg/gemv.cu
rename to cpp/tests/linalg/gemv.cu
diff --git a/cpp/test/linalg/map.cu b/cpp/tests/linalg/map.cu
similarity index 100%
rename from cpp/test/linalg/map.cu
rename to cpp/tests/linalg/map.cu
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/tests/linalg/map_then_reduce.cu
similarity index 100%
rename from cpp/test/linalg/map_then_reduce.cu
rename to cpp/tests/linalg/map_then_reduce.cu
diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/tests/linalg/matrix_vector.cu
similarity index 100%
rename from cpp/test/linalg/matrix_vector.cu
rename to cpp/tests/linalg/matrix_vector.cu
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/tests/linalg/matrix_vector_op.cu
similarity index 100%
rename from cpp/test/linalg/matrix_vector_op.cu
rename to cpp/tests/linalg/matrix_vector_op.cu
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/tests/linalg/matrix_vector_op.cuh
similarity index 100%
rename from cpp/test/linalg/matrix_vector_op.cuh
rename to cpp/tests/linalg/matrix_vector_op.cuh
diff --git a/cpp/test/linalg/mean_squared_error.cu b/cpp/tests/linalg/mean_squared_error.cu
similarity index 100%
rename from cpp/test/linalg/mean_squared_error.cu
rename to cpp/tests/linalg/mean_squared_error.cu
diff --git a/cpp/test/linalg/multiply.cu b/cpp/tests/linalg/multiply.cu
similarity index 100%
rename from cpp/test/linalg/multiply.cu
rename to cpp/tests/linalg/multiply.cu
diff --git a/cpp/test/linalg/norm.cu b/cpp/tests/linalg/norm.cu
similarity index 100%
rename from cpp/test/linalg/norm.cu
rename to cpp/tests/linalg/norm.cu
diff --git a/cpp/test/linalg/normalize.cu b/cpp/tests/linalg/normalize.cu
similarity index 100%
rename from cpp/test/linalg/normalize.cu
rename to cpp/tests/linalg/normalize.cu
diff --git a/cpp/test/linalg/power.cu b/cpp/tests/linalg/power.cu
similarity index 100%
rename from cpp/test/linalg/power.cu
rename to cpp/tests/linalg/power.cu
diff --git a/cpp/test/linalg/randomized_svd.cu b/cpp/tests/linalg/randomized_svd.cu
similarity index 100%
rename from cpp/test/linalg/randomized_svd.cu
rename to cpp/tests/linalg/randomized_svd.cu
diff --git a/cpp/test/linalg/reduce.cu b/cpp/tests/linalg/reduce.cu
similarity index 100%
rename from cpp/test/linalg/reduce.cu
rename to cpp/tests/linalg/reduce.cu
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/tests/linalg/reduce.cuh
similarity index 100%
rename from cpp/test/linalg/reduce.cuh
rename to cpp/tests/linalg/reduce.cuh
diff --git a/cpp/test/linalg/reduce_cols_by_key.cu b/cpp/tests/linalg/reduce_cols_by_key.cu
similarity index 100%
rename from cpp/test/linalg/reduce_cols_by_key.cu
rename to cpp/tests/linalg/reduce_cols_by_key.cu
diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/tests/linalg/reduce_rows_by_key.cu
similarity index 100%
rename from cpp/test/linalg/reduce_rows_by_key.cu
rename to cpp/tests/linalg/reduce_rows_by_key.cu
diff --git a/cpp/test/linalg/rsvd.cu b/cpp/tests/linalg/rsvd.cu
similarity index 100%
rename from cpp/test/linalg/rsvd.cu
rename to cpp/tests/linalg/rsvd.cu
diff --git a/cpp/test/linalg/sqrt.cu b/cpp/tests/linalg/sqrt.cu
similarity index 100%
rename from cpp/test/linalg/sqrt.cu
rename to cpp/tests/linalg/sqrt.cu
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/tests/linalg/strided_reduction.cu
similarity index 100%
rename from cpp/test/linalg/strided_reduction.cu
rename to cpp/tests/linalg/strided_reduction.cu
diff --git a/cpp/test/linalg/subtract.cu b/cpp/tests/linalg/subtract.cu
similarity index 100%
rename from cpp/test/linalg/subtract.cu
rename to cpp/tests/linalg/subtract.cu
diff --git a/cpp/test/linalg/svd.cu b/cpp/tests/linalg/svd.cu
similarity index 100%
rename from cpp/test/linalg/svd.cu
rename to cpp/tests/linalg/svd.cu
diff --git a/cpp/test/linalg/ternary_op.cu b/cpp/tests/linalg/ternary_op.cu
similarity index 100%
rename from cpp/test/linalg/ternary_op.cu
rename to cpp/tests/linalg/ternary_op.cu
diff --git a/cpp/test/linalg/transpose.cu b/cpp/tests/linalg/transpose.cu
similarity index 100%
rename from cpp/test/linalg/transpose.cu
rename to cpp/tests/linalg/transpose.cu
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/tests/linalg/unary_op.cu
similarity index 100%
rename from cpp/test/linalg/unary_op.cu
rename to cpp/tests/linalg/unary_op.cu
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/tests/linalg/unary_op.cuh
similarity index 100%
rename from cpp/test/linalg/unary_op.cuh
rename to cpp/tests/linalg/unary_op.cuh
diff --git a/cpp/test/matrix/argmax.cu b/cpp/tests/matrix/argmax.cu
similarity index 99%
rename from cpp/test/matrix/argmax.cu
rename to cpp/tests/matrix/argmax.cu
index cb3fd4a3fb..c0cf85cd38 100644
--- a/cpp/test/matrix/argmax.cu
+++ b/cpp/tests/matrix/argmax.cu
@@ -110,4 +110,4 @@ INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/test/matrix/argmin.cu b/cpp/tests/matrix/argmin.cu
similarity index 99%
rename from cpp/test/matrix/argmin.cu
rename to cpp/tests/matrix/argmin.cu
index 060b4a78db..f0cacacf3a 100644
--- a/cpp/test/matrix/argmin.cu
+++ b/cpp/tests/matrix/argmin.cu
@@ -110,4 +110,4 @@ INSTANTIATE_TEST_SUITE_P(ArgMinTest, ArgMinTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_SUITE_P(ArgMinTest, ArgMinTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/test/matrix/columnSort.cu b/cpp/tests/matrix/columnSort.cu
similarity index 100%
rename from cpp/test/matrix/columnSort.cu
rename to cpp/tests/matrix/columnSort.cu
diff --git a/cpp/test/matrix/diagonal.cu b/cpp/tests/matrix/diagonal.cu
similarity index 99%
rename from cpp/test/matrix/diagonal.cu
rename to cpp/tests/matrix/diagonal.cu
index c6e1f1a0d2..0a1f2af825 100644
--- a/cpp/test/matrix/diagonal.cu
+++ b/cpp/tests/matrix/diagonal.cu
@@ -116,4 +116,4 @@ INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestF, ::testing::ValuesIn(inputs
 INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/test/matrix/eye.cu b/cpp/tests/matrix/eye.cu
similarity index 100%
rename from cpp/test/matrix/eye.cu
rename to cpp/tests/matrix/eye.cu
diff --git a/cpp/test/matrix/gather.cu b/cpp/tests/matrix/gather.cu
similarity index 99%
rename from cpp/test/matrix/gather.cu
rename to cpp/tests/matrix/gather.cu
index 4c13d0c1e9..f62805b2b8 100644
--- a/cpp/test/matrix/gather.cu
+++ b/cpp/tests/matrix/gather.cu
@@ -246,4 +246,4 @@ GATHER_TEST((GatherTest<false, false, true, float, uint32_t, int64_t>),
 GATHER_TEST((GatherTest<false, false, true, float, int64_t, int64_t>),
             GatherInplaceTestFI64I64,
             inplace_inputs_i64);
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/tests/matrix/linewise_op.cu
similarity index 100%
rename from cpp/test/matrix/linewise_op.cu
rename to cpp/tests/matrix/linewise_op.cu
diff --git a/cpp/test/matrix/math.cu b/cpp/tests/matrix/math.cu
similarity index 100%
rename from cpp/test/matrix/math.cu
rename to cpp/tests/matrix/math.cu
diff --git a/cpp/test/matrix/matrix.cu b/cpp/tests/matrix/matrix.cu
similarity index 100%
rename from cpp/test/matrix/matrix.cu
rename to cpp/tests/matrix/matrix.cu
diff --git a/cpp/test/matrix/norm.cu b/cpp/tests/matrix/norm.cu
similarity index 100%
rename from cpp/test/matrix/norm.cu
rename to cpp/tests/matrix/norm.cu
diff --git a/cpp/test/matrix/reverse.cu b/cpp/tests/matrix/reverse.cu
similarity index 100%
rename from cpp/test/matrix/reverse.cu
rename to cpp/tests/matrix/reverse.cu
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/tests/matrix/sample_rows.cu
similarity index 100%
rename from cpp/test/matrix/sample_rows.cu
rename to cpp/tests/matrix/sample_rows.cu
diff --git a/cpp/test/matrix/scatter.cu b/cpp/tests/matrix/scatter.cu
similarity index 99%
rename from cpp/test/matrix/scatter.cu
rename to cpp/tests/matrix/scatter.cu
index 7f478c7b93..f539b9759a 100644
--- a/cpp/test/matrix/scatter.cu
+++ b/cpp/tests/matrix/scatter.cu
@@ -140,4 +140,4 @@ const std::vector<ScatterInputs<int64_t>> inputs_i64 =
 
 SCATTER_TEST((ScatterTest<float, int>), ScatterTestFI32, inputs_i32);
 SCATTER_TEST((ScatterTest<float, int64_t>), ScatterTestFI64, inputs_i64);
-}  // end namespace raft
\ No newline at end of file
+}  // end namespace raft
diff --git a/cpp/test/matrix/select_k.cu b/cpp/tests/matrix/select_k.cu
similarity index 100%
rename from cpp/test/matrix/select_k.cu
rename to cpp/tests/matrix/select_k.cu
diff --git a/cpp/test/matrix/select_k.cuh b/cpp/tests/matrix/select_k.cuh
similarity index 100%
rename from cpp/test/matrix/select_k.cuh
rename to cpp/tests/matrix/select_k.cuh
diff --git a/cpp/test/matrix/select_large_k.cu b/cpp/tests/matrix/select_large_k.cu
similarity index 100%
rename from cpp/test/matrix/select_large_k.cu
rename to cpp/tests/matrix/select_large_k.cu
diff --git a/cpp/test/matrix/slice.cu b/cpp/tests/matrix/slice.cu
similarity index 100%
rename from cpp/test/matrix/slice.cu
rename to cpp/tests/matrix/slice.cu
diff --git a/cpp/test/matrix/triangular.cu b/cpp/tests/matrix/triangular.cu
similarity index 100%
rename from cpp/test/matrix/triangular.cu
rename to cpp/tests/matrix/triangular.cu
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/tests/mr/device/buffer.cpp
similarity index 99%
rename from cpp/test/mr/device/buffer.cpp
rename to cpp/tests/mr/device/buffer.cpp
index d14aa09b7a..3d5652a591 100644
--- a/cpp/test/mr/device/buffer.cpp
+++ b/cpp/tests/mr/device/buffer.cpp
@@ -92,4 +92,4 @@ TEST(Raft, DeviceBufferZeroResize)
 
 }  // namespace device
 }  // namespace mr
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/tests/mr/host/buffer.cpp
similarity index 99%
rename from cpp/test/mr/host/buffer.cpp
rename to cpp/tests/mr/host/buffer.cpp
index 5688ff6376..792160eb89 100644
--- a/cpp/test/mr/host/buffer.cpp
+++ b/cpp/tests/mr/host/buffer.cpp
@@ -69,4 +69,4 @@ TEST(Raft, DeviceToHostBuffer)
 
 }  // namespace host
 }  // namespace mr
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/tests/neighbors/ball_cover.cu
similarity index 100%
rename from cpp/test/neighbors/ball_cover.cu
rename to cpp/tests/neighbors/ball_cover.cu
diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/tests/neighbors/epsilon_neighborhood.cu
similarity index 100%
rename from cpp/test/neighbors/epsilon_neighborhood.cu
rename to cpp/tests/neighbors/epsilon_neighborhood.cu
diff --git a/cpp/test/neighbors/haversine.cu b/cpp/tests/neighbors/haversine.cu
similarity index 100%
rename from cpp/test/neighbors/haversine.cu
rename to cpp/tests/neighbors/haversine.cu
diff --git a/cpp/test/neighbors/knn_utils.cuh b/cpp/tests/neighbors/knn_utils.cuh
similarity index 100%
rename from cpp/test/neighbors/knn_utils.cuh
rename to cpp/tests/neighbors/knn_utils.cuh
diff --git a/cpp/test/neighbors/spatial_data.h b/cpp/tests/neighbors/spatial_data.h
similarity index 98%
rename from cpp/test/neighbors/spatial_data.h
rename to cpp/tests/neighbors/spatial_data.h
index d71b47cf1e..b4352f706d 100644
--- a/cpp/test/neighbors/spatial_data.h
+++ b/cpp/tests/neighbors/spatial_data.h
@@ -35,4 +35,4 @@ std::vector<float> spatial_data = {
   31.968599, -99.901813,  39.32098,  -111.093731, 37.431573, -78.656894,  44.558803, -72.577841,
   47.751074, -120.740139, 43.78444,  -88.787868,  38.597626, -80.454903,  43.075968, -107.290284};
 };  // namespace spatial
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/test/random/excess_sampling.cu b/cpp/tests/random/excess_sampling.cu
similarity index 100%
rename from cpp/test/random/excess_sampling.cu
rename to cpp/tests/random/excess_sampling.cu
diff --git a/cpp/test/random/make_blobs.cu b/cpp/tests/random/make_blobs.cu
similarity index 100%
rename from cpp/test/random/make_blobs.cu
rename to cpp/tests/random/make_blobs.cu
diff --git a/cpp/test/random/make_regression.cu b/cpp/tests/random/make_regression.cu
similarity index 100%
rename from cpp/test/random/make_regression.cu
rename to cpp/tests/random/make_regression.cu
diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/tests/random/multi_variable_gaussian.cu
similarity index 100%
rename from cpp/test/random/multi_variable_gaussian.cu
rename to cpp/tests/random/multi_variable_gaussian.cu
diff --git a/cpp/test/random/permute.cu b/cpp/tests/random/permute.cu
similarity index 100%
rename from cpp/test/random/permute.cu
rename to cpp/tests/random/permute.cu
diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/tests/random/rmat_rectangular_generator.cu
similarity index 79%
rename from cpp/test/random/rmat_rectangular_generator.cu
rename to cpp/tests/random/rmat_rectangular_generator.cu
index 8d668f7a8a..10c00051b6 100644
--- a/cpp/test/random/rmat_rectangular_generator.cu
+++ b/cpp/tests/random/rmat_rectangular_generator.cu
@@ -155,10 +155,10 @@ RAFT_KERNEL compute_hist(
   size_t idx = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
   if (idx + 1 < len) {
     auto src = out[idx], dst = out[idx + 1];
-    for (size_t j = 0; j < max_scale; ++j) {
-      bool src_bit = j < r_scale ? src & (1 << (r_scale - j - 1)) : 0;
-      bool dst_bit = j < c_scale ? dst & (1 << (c_scale - j - 1)) : 0;
-      auto idx     = j * 4 + src_bit * 2 + dst_bit;
+    for (size_t bit_pos = 0; bit_pos < max_scale; ++bit_pos) {
+      bool src_bit = bit_pos < r_scale ? src & (1 << bit_pos) : 0;
+      bool dst_bit = bit_pos < c_scale ? dst & (1 << bit_pos) : 0;
+      auto idx     = bit_pos * 4 + src_bit * 2 + dst_bit;
       atomicAdd(hist + idx, 1);
     }
   }
@@ -393,11 +393,101 @@ const std::vector<RmatInputs> inputs = {
   {18, 16, 200000, false, 456789ULL, TOLERANCE},
   {18, 16, 200000, true, 456789ULL, TOLERANCE}};
 
+struct RmatForcedOutputs {
+  size_t r_scale;
+  size_t c_scale;
+  size_t r_node_id;
+  size_t c_node_id;
+};
+
+class RmatGenForceTest : public ::testing::TestWithParam<RmatForcedOutputs> {
+ public:
+  RmatGenForceTest()
+    : handle{},
+      stream{resource::get_cuda_stream(handle)},
+      params{::testing::TestWithParam<RmatForcedOutputs>::GetParam()},
+      max_scale(std::max(params.r_scale, params.c_scale)),
+      h_theta{},
+      out{2, stream},
+      out_src{1, stream},
+      out_dst{1, stream},
+      theta{0, stream},
+      state{0, GeneratorType::GenPC}
+  {
+    theta.resize(4 * max_scale, stream);
+    h_theta.resize(theta.size(), 0.f);
+    for (size_t bit_pos = 0; bit_pos < max_scale; ++bit_pos) {
+      // size_t mask: a plain `1 << bit_pos` is an int shift and breaks once bit_pos reaches 31
+      size_t row_bit = ((params.r_node_id & (size_t{1} << bit_pos)) != 0);
+      size_t col_bit = ((params.c_node_id & (size_t{1} << bit_pos)) != 0);
+      // now force theta for bit -- 2x2 matrix row major
+      h_theta[4 * bit_pos + row_bit * 2 + col_bit] = 1.f;
+    }
+
+    raft::update_device(theta.data(), h_theta.data(), max_scale * 4, stream);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+ protected:
+  void SetUp() override
+  {
+    rmat_rectangular_gen(out.data(),
+                         out_src.data(),
+                         out_dst.data(),
+                         theta.data(),
+                         params.r_scale,
+                         params.c_scale,
+                         size_t(1),
+                         stream,
+                         state);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  void validate()
+  {
+    std::vector<size_t> h_out(2, size_t(0));
+    raft::update_host(h_out.data(), out.data(), 2, stream);
+    RAFT_CUDA_TRY(cudaGetLastError());
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+
+    std::vector<size_t> h_out_expect;
+    h_out_expect.push_back(params.r_node_id);
+    h_out_expect.push_back(params.c_node_id);
+
+    ASSERT_TRUE(hostVecMatch(h_out_expect, h_out, raft::Compare<size_t>()));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  RmatForcedOutputs params;
+  size_t max_scale;
+  std::vector<float> h_theta;
+  rmm::device_uvector<size_t> out, out_src, out_dst;
+  rmm::device_uvector<float> theta;
+  RngState state;
+};
+
+const std::vector<RmatForcedOutputs> forcedInputs = {{16, 16, 12425, 1233},
+                                                     {16, 16, 12, 424},
+                                                     {5, 5, 15, 15},
+                                                     {5, 6, 15, 15},
+                                                     {5, 15, 15, 15},
+                                                     {6, 5, 15, 15},
+                                                     {15, 5, 15, 15},
+                                                     {32, 16, 1253163, 60000},
+                                                     {16, 16, 12, 0},
+                                                     {16, 16, 0, 1255}};
+
 TEST_P(RmatGenTest, Result) { validate(); }
 INSTANTIATE_TEST_SUITE_P(RmatGenTests, RmatGenTest, ::testing::ValuesIn(inputs));
 
 TEST_P(RmatGenMdspanTest, Result) { validate(); }
 INSTANTIATE_TEST_SUITE_P(RmatGenMdspanTests, RmatGenMdspanTest, ::testing::ValuesIn(inputs));
 
+TEST_P(RmatGenForceTest, Result) { validate(); }
+INSTANTIATE_TEST_SUITE_P(RmatGenForceTests, RmatGenForceTest, ::testing::ValuesIn(forcedInputs));
+
 }  // namespace random
 }  // namespace raft
diff --git a/cpp/test/random/rng.cu b/cpp/tests/random/rng.cu
similarity index 99%
rename from cpp/test/random/rng.cu
rename to cpp/tests/random/rng.cu
index a37f150d4c..172f94ae50 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/tests/random/rng.cu
@@ -407,8 +407,7 @@ TEST(Rng, MeanError)
     RngState r(seed, rtype);
     normal(handle, r, data.data(), len, 3.3f, 0.23f);
     // uniform(r, data, len, -1.0, 2.0);
-    raft::stats::mean(
-      mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(mean_result.data(), data.data(), num_samples, num_experiments, false, stream);
     raft::stats::stddev(std_result.data(),
                         data.data(),
                         mean_result.data(),
diff --git a/cpp/test/random/rng_discrete.cu b/cpp/tests/random/rng_discrete.cu
similarity index 100%
rename from cpp/test/random/rng_discrete.cu
rename to cpp/tests/random/rng_discrete.cu
diff --git a/cpp/test/random/rng_int.cu b/cpp/tests/random/rng_int.cu
similarity index 100%
rename from cpp/test/random/rng_int.cu
rename to cpp/tests/random/rng_int.cu
diff --git a/cpp/test/random/rng_pcg_host_api.cu b/cpp/tests/random/rng_pcg_host_api.cu
similarity index 100%
rename from cpp/test/random/rng_pcg_host_api.cu
rename to cpp/tests/random/rng_pcg_host_api.cu
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/tests/random/sample_without_replacement.cu
similarity index 100%
rename from cpp/test/random/sample_without_replacement.cu
rename to cpp/tests/random/sample_without_replacement.cu
diff --git a/cpp/test/sparse/add.cu b/cpp/tests/sparse/add.cu
similarity index 100%
rename from cpp/test/sparse/add.cu
rename to cpp/tests/sparse/add.cu
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/tests/sparse/convert_coo.cu
similarity index 100%
rename from cpp/test/sparse/convert_coo.cu
rename to cpp/tests/sparse/convert_coo.cu
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/tests/sparse/convert_csr.cu
similarity index 57%
rename from cpp/test/sparse/convert_csr.cu
rename to cpp/tests/sparse/convert_csr.cu
index 1cd49b0bbd..d74296a267 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/tests/sparse/convert_csr.cu
@@ -17,6 +17,7 @@
 #include "../test_utils.cuh"
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/bitset.cuh>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/sparse/convert/csr.cuh>
 #include <raft/sparse/coo.hpp>
@@ -249,7 +250,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
   index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
   {
     index_t total    = static_cast<index_t>(m * n);
-    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * (1.0f - sparsity));
     index_t res      = num_ones;
 
     for (auto& item : bitmap) {
@@ -257,7 +258,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
     }
 
     std::random_device rd;
-    std::mt19937 gen(rd());
+    std::mt19937 gen(random_number = rd());
     std::uniform_int_distribution<index_t> dis(0, total - 1);
 
     while (num_ones > 0) {
@@ -318,8 +319,8 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
       size_t start_idx = row_ptrs1[i];
       size_t end_idx   = row_ptrs1[i + 1];
 
-      std::vector<int> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
-      std::vector<int> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
+      std::vector<index_t> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
+      std::vector<index_t> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
 
       std::sort(cols1.begin(), cols1.end());
       std::sort(cols2.begin(), cols2.end());
@@ -370,7 +371,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
         raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_rows, params.n_cols, nnz);
       auto csr_view = csr.structure_view();
 
-      convert::bitmap_to_csr(handle, bitmap, csr);
+      bitmap.to_csr(handle, csr);
       raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
       raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
       raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
@@ -379,7 +380,7 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
         indptr_d.data(), indices_d.data(), params.n_rows, params.n_cols, nnz);
       auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
 
-      convert::bitmap_to_csr(handle, bitmap, csr);
+      bitmap.to_csr(handle, csr);
       raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
     }
     resource::sync_stream(handle);
@@ -396,9 +397,13 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
 
     resource::sync_stream(handle);
 
-    ASSERT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h));
-    ASSERT_TRUE(raft::devArrMatch<value_t>(
-      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream));
+    EXPECT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h))
+      << " n_row: " << params.n_rows << ", n_cols: " << params.n_cols << ", nnz: " << nnz
+      << ", random_number: " << random_number;
+    EXPECT_TRUE(raft::devArrMatch<value_t>(
+      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream))
+      << " n_row: " << params.n_rows << ", n_cols: " << params.n_cols << ", nnz: " << nnz
+      << ", random_number: " << random_number;
   }
 
  protected:
@@ -418,6 +423,8 @@ class BitmapToCSRTest : public ::testing::TestWithParam<BitmapToCSRInputs<index_
   rmm::device_uvector<index_t> indptr_expected_d;
   rmm::device_uvector<index_t> indices_expected_d;
   rmm::device_uvector<float> values_expected_d;
+
+  unsigned int random_number;
 };
 
 using BitmapToCSRTestI = BitmapToCSRTest<uint32_t, int, float>;
@@ -426,8 +433,295 @@ TEST_P(BitmapToCSRTestI, Result) { Run(); }
 using BitmapToCSRTestL = BitmapToCSRTest<uint32_t, int64_t, float>;
 TEST_P(BitmapToCSRTestL, Result) { Run(); }
 
+using BitmapToCSRTestLOnLargeSize = BitmapToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitmapToCSRTestLOnLargeSize, Result) { Run(); }
+
 template <typename index_t>
 const std::vector<BitmapToCSRInputs<index_t>> bitmaptocsr_inputs = {
+  {0, 0, 0.8, false},
+  {10, 32, 0.6, false},
+  {10, 3, 0.8, false},
+  {32, 1024, 0.6, false},
+  {1024, 1048576, 0.99, false},
+  {1024, 1024, 0.6, false},
+  {64 * 1024 + 10, 2, 0.7, false},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.7, false},             // No peeling-remainder
+  {17, 16, 0.7, false},             // Check peeling-remainder
+  {18, 16, 0.7, false},             // Check peeling-remainder
+  {32 + 9, 33, 0.8, false},         // Check peeling-remainder
+  {2, 33, 0.8, false},              // Check peeling-remainder
+  {0, 0, 0.8, true},
+  {10, 32, 0.6, true},
+  {10, 3, 0.8, true},
+  {32, 1024, 0.6, true},
+  {1024, 1048576, 0.99, true},
+  {1024, 1024, 0.6, true},
+  {64 * 1024 + 10, 2, 0.7, true},  // 64K + 10 is slightly over maximum of blockDim.y
+  {16, 16, 0.7, true},             // No peeling-remainder
+  {17, 16, 0.7, true},             // Check peeling-remainder
+  {18, 16, 0.7, true},             // Check peeling-remainder
+  {32 + 9, 33, 0.8, true},         // Check peeling-remainder
+  {2, 33, 0.8, true},              // Check peeling-remainder
+};
+
+template <typename index_t>
+const std::vector<BitmapToCSRInputs<index_t>> bitmaptocsr_large_inputs = {
+  {100, 100000000, 0.99, true}, {100, 100000000, 0.95, false}, {100, 100000000 + 17, 0.95, false}};
+
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitmapToCSRTestI,
+                        ::testing::ValuesIn(bitmaptocsr_inputs<int>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitmapToCSRTestL,
+                        ::testing::ValuesIn(bitmaptocsr_inputs<int64_t>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitmapToCSRTestLOnLargeSize,
+                        ::testing::ValuesIn(bitmaptocsr_large_inputs<int64_t>));
+
+/******************************** bitset to csr ********************************/
+
+template <typename index_t>
+struct BitsetToCSRInputs {
+  index_t n_repeat;
+  index_t n_cols;
+  float sparsity;
+  bool owning;
+};
+
+template <typename bitset_t, typename index_t, typename value_t>
+class BitsetToCSRTest : public ::testing::TestWithParam<BitsetToCSRInputs<index_t>> {
+ public:
+  BitsetToCSRTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<BitsetToCSRInputs<index_t>>::GetParam()),
+      bitset_d(0, stream),
+      indices_d(0, stream),
+      indptr_d(0, stream),
+      values_d(0, stream),
+      indptr_expected_d(0, stream),
+      indices_expected_d(0, stream),
+      values_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  // Writes `repeat` back-to-back copies of the first `input_bits` bits of `input` into `output`.
+  // `output` is not resized here; it must hold at least ceil(input_bits * repeat / word) words.
+  void repeat_cpu_bitset(std::vector<bitset_t>& input,
+                         size_t input_bits,
+                         size_t repeat,
+                         std::vector<bitset_t>& output)
+  {
+    constexpr size_t bits_per_unit = sizeof(bitset_t) * 8;
+    const size_t output_bits       = input_bits * repeat;
+    const size_t output_units      = (output_bits + bits_per_unit - 1) / bits_per_unit;
+
+    // std::fill_n, not memset: an empty vector's data() may be null, which memset must not get.
+    std::fill_n(output.begin(), output_units, static_cast<bitset_t>(0));
+
+    size_t output_bit_index = 0;
+    for (size_t r = 0; r < repeat; ++r) {
+      for (size_t i = 0; i < input_bits; ++i) {
+        const bool bit = (input[i / bits_per_unit] >> (i % bits_per_unit)) & 1;
+
+        // OR the bit into its destination word; positions advance contiguously across copies.
+        const size_t output_unit_index = output_bit_index / bits_per_unit;
+        const size_t output_bit_offset = output_bit_index % bits_per_unit;
+        output[output_unit_index] |= (static_cast<bitset_t>(bit) << output_bit_offset);
+        ++output_bit_index;
+      }
+    }
+  }
+
+  // Zeroes `bitset`, then sets trunc(m * n * sparsity) distinct random bits; returns that count.
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitset_t>& bitset)
+  {
+    index_t total    = static_cast<index_t>(m * n);
+    index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
+    index_t res      = num_ones;
+
+    for (auto& item : bitset) {
+      item = static_cast<bitset_t>(0);
+    }
+    // uniform_int_distribution requires a <= b, so clamp the upper bound when total == 0.
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis(0, total > 0 ? total - 1 : 0);
+
+    while (num_ones > 0) {
+      index_t index = dis(gen);
+      bitset_t& element    = bitset[index / (8 * sizeof(bitset_t))];
+      index_t bit_position = index % (8 * sizeof(bitset_t));
+      // Build the mask in bitset_t: a signed index_t `1 << 31` overflows for 32-bit words.
+      if (((element >> bit_position) & 1) == 0) {
+        element |= (static_cast<bitset_t>(1) << bit_position);
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void cpu_convert_to_csr(std::vector<bitset_t>& bitset,
+                          index_t rows,
+                          index_t cols,
+                          std::vector<index_t>& indices,
+                          std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    index_t index        = 0;
+    bitset_t element     = 0;
+    index_t bit_position = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        index        = i * cols + j;
+        element      = bitset[index / (8 * sizeof(bitset_t))];
+        bit_position = index % (8 * sizeof(bitset_t));
+
+        if (((element >> bit_position) & 1)) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  bool csr_compare(const std::vector<index_t>& row_ptrs1,
+                   const std::vector<index_t>& col_indices1,
+                   const std::vector<index_t>& row_ptrs2,
+                   const std::vector<index_t>& col_indices2)
+  {
+    if (row_ptrs1.size() != row_ptrs2.size()) { return false; }
+
+    if (col_indices1.size() != col_indices2.size()) { return false; }
+
+    if (!std::equal(row_ptrs1.begin(), row_ptrs1.end(), row_ptrs2.begin())) { return false; }
+
+    for (size_t i = 0; i < row_ptrs1.size() - 1; ++i) {
+      size_t start_idx = row_ptrs1[i];
+      size_t end_idx   = row_ptrs1[i + 1];
+
+      std::vector<index_t> cols1(col_indices1.begin() + start_idx, col_indices1.begin() + end_idx);
+      std::vector<index_t> cols2(col_indices2.begin() + start_idx, col_indices2.begin() + end_idx);
+
+      std::sort(cols1.begin(), cols1.end());
+      std::sort(cols2.begin(), cols2.end());
+
+      if (cols1 != cols2) { return false; }
+    }
+
+    return true;
+  }
+
+  void SetUp() override
+  {
+    index_t element = raft::ceildiv(1 * params.n_cols, index_t(sizeof(bitset_t) * 8));
+    std::vector<bitset_t> bitset_h(element);
+    std::vector<bitset_t> bitset_repeat_h(element * params.n_repeat);
+
+    nnz = create_sparse_matrix(1, params.n_cols, params.sparsity, bitset_h);
+
+    repeat_cpu_bitset(bitset_h, size_t(params.n_cols), size_t(params.n_repeat), bitset_repeat_h);
+    nnz *= params.n_repeat;
+
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_repeat + 1);
+
+    cpu_convert_to_csr(bitset_repeat_h, params.n_repeat, params.n_cols, indices_h, indptr_h);
+
+    bitset_d.resize(bitset_h.size(), stream);
+    indptr_d.resize(params.n_repeat + 1, stream);
+    indices_d.resize(nnz, stream);
+
+    indptr_expected_d.resize(params.n_repeat + 1, stream);
+    indices_expected_d.resize(nnz, stream);
+    values_expected_d.resize(nnz, stream);
+
+    thrust::fill_n(resource::get_thrust_policy(handle), values_expected_d.data(), nnz, value_t{1});
+
+    values_d.resize(nnz, stream);
+
+    update_device(indices_expected_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_expected_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(bitset_d.data(), bitset_h.data(), bitset_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  void Run()
+  {
+    auto bitset = raft::core::bitset_view<bitset_t, index_t>(bitset_d.data(), params.n_cols);
+
+    if (params.owning) {
+      auto csr =
+        raft::make_device_csr_matrix<value_t, index_t>(handle, params.n_repeat, params.n_cols, nnz);
+      auto csr_view = csr.structure_view();
+
+      bitset.to_csr(handle, csr);
+      raft::copy(indptr_d.data(), csr_view.get_indptr().data(), indptr_d.size(), stream);
+      raft::copy(indices_d.data(), csr_view.get_indices().data(), indices_d.size(), stream);
+      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+    } else {
+      auto csr_view = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+        indptr_d.data(), indices_d.data(), params.n_repeat, params.n_cols, nnz);
+      auto csr = raft::make_device_csr_matrix<value_t, index_t>(handle, csr_view);
+
+      bitset.to_csr(handle, csr);
+      raft::copy(values_d.data(), csr.get_elements().data(), nnz, stream);
+    }
+    resource::sync_stream(handle);
+
+    std::vector<index_t> indices_h(indices_expected_d.size(), 0);
+    std::vector<index_t> indices_expected_h(indices_expected_d.size(), 0);
+    update_host(indices_h.data(), indices_d.data(), indices_h.size(), stream);
+    update_host(indices_expected_h.data(), indices_expected_d.data(), indices_h.size(), stream);
+
+    std::vector<index_t> indptr_h(indptr_expected_d.size(), 0);
+    std::vector<index_t> indptr_expected_h(indptr_expected_d.size(), 0);
+    update_host(indptr_h.data(), indptr_d.data(), indptr_h.size(), stream);
+    update_host(indptr_expected_h.data(), indptr_expected_d.data(), indptr_h.size(), stream);
+
+    resource::sync_stream(handle);
+
+    ASSERT_TRUE(csr_compare(indptr_h, indices_h, indptr_expected_h, indices_expected_h));
+    ASSERT_TRUE(raft::devArrMatch<value_t>(
+      values_expected_d.data(), values_d.data(), nnz, raft::Compare<value_t>(), stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  BitsetToCSRInputs<index_t> params;
+
+  rmm::device_uvector<bitset_t> bitset_d;
+
+  index_t nnz;
+
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<float> values_d;
+
+  rmm::device_uvector<index_t> indptr_expected_d;
+  rmm::device_uvector<index_t> indices_expected_d;
+  rmm::device_uvector<float> values_expected_d;
+};
+
+using BitsetToCSRTestI = BitsetToCSRTest<uint32_t, int, float>;
+TEST_P(BitsetToCSRTestI, Result) { Run(); }
+
+using BitsetToCSRTestL = BitsetToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitsetToCSRTestL, Result) { Run(); }
+
+using BitsetToCSRTestLOnLargeSize = BitsetToCSRTest<uint32_t, int64_t, float>;
+TEST_P(BitsetToCSRTestLOnLargeSize, Result) { Run(); }
+
+template <typename index_t>
+const std::vector<BitsetToCSRInputs<index_t>> bitsettocsr_inputs = {
   {0, 0, 0.2, false},
   {10, 32, 0.4, false},
   {10, 3, 0.2, false},
@@ -454,12 +748,19 @@ const std::vector<BitmapToCSRInputs<index_t>> bitmaptocsr_inputs = {
   {2, 33, 0.2, true},              // Check peeling-remainder
 };
 
+template <typename index_t>
+const std::vector<BitsetToCSRInputs<index_t>> bitsettocsr_large_inputs = {
+  {100, 100000000, 0.01, true}, {100, 100000000, 0.05, false}, {100, 100000000 + 17, 0.05, false}};
+
 INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
-                        BitmapToCSRTestI,
-                        ::testing::ValuesIn(bitmaptocsr_inputs<int>));
+                        BitsetToCSRTestI,
+                        ::testing::ValuesIn(bitsettocsr_inputs<int>));
 INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
-                        BitmapToCSRTestL,
-                        ::testing::ValuesIn(bitmaptocsr_inputs<int64_t>));
+                        BitsetToCSRTestL,
+                        ::testing::ValuesIn(bitsettocsr_inputs<int64_t>));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        BitsetToCSRTestLOnLargeSize,
+                        ::testing::ValuesIn(bitsettocsr_large_inputs<int64_t>));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/tests/sparse/csr_row_slice.cu
similarity index 100%
rename from cpp/test/sparse/csr_row_slice.cu
rename to cpp/tests/sparse/csr_row_slice.cu
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/tests/sparse/csr_to_dense.cu
similarity index 100%
rename from cpp/test/sparse/csr_to_dense.cu
rename to cpp/tests/sparse/csr_to_dense.cu
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/tests/sparse/csr_transpose.cu
similarity index 100%
rename from cpp/test/sparse/csr_transpose.cu
rename to cpp/tests/sparse/csr_transpose.cu
diff --git a/cpp/test/sparse/degree.cu b/cpp/tests/sparse/degree.cu
similarity index 100%
rename from cpp/test/sparse/degree.cu
rename to cpp/tests/sparse/degree.cu
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/tests/sparse/dist_coo_spmv.cu
similarity index 100%
rename from cpp/test/sparse/dist_coo_spmv.cu
rename to cpp/tests/sparse/dist_coo_spmv.cu
diff --git a/cpp/test/sparse/distance.cu b/cpp/tests/sparse/distance.cu
similarity index 100%
rename from cpp/test/sparse/distance.cu
rename to cpp/tests/sparse/distance.cu
diff --git a/cpp/test/sparse/filter.cu b/cpp/tests/sparse/filter.cu
similarity index 100%
rename from cpp/test/sparse/filter.cu
rename to cpp/tests/sparse/filter.cu
diff --git a/cpp/test/sparse/masked_matmul.cu b/cpp/tests/sparse/masked_matmul.cu
similarity index 75%
rename from cpp/test/sparse/masked_matmul.cu
rename to cpp/tests/sparse/masked_matmul.cu
index f883beae32..5ee1677015 100644
--- a/cpp/test/sparse/masked_matmul.cu
+++ b/cpp/tests/sparse/masked_matmul.cu
@@ -19,7 +19,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/random/make_blobs.cuh>
-#include <raft/sparse/linalg/masked_matmul.hpp>
+#include <raft/sparse/linalg/masked_matmul.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <thrust/reduce.h>
@@ -46,6 +46,8 @@ struct MaskedMatmulInputs {
   unsigned long long int seed;
 };
 
+enum class BitsLayout { Bitset, Bitmap };
+
 template <typename value_t>
 struct sum_abs_op {
   __host__ __device__ value_t operator()(const value_t& x, const value_t& y) const
@@ -87,7 +89,8 @@ bool isCuSparseVersionGreaterThan_12_0_1()
 template <typename value_t,
           typename output_t,
           typename index_t,
-          typename bitmap_t      = uint32_t,
+          BitsLayout bits_layout = BitsLayout::Bitmap,
+          typename bits_t        = uint32_t,
           typename LayoutPolicyA = raft::row_major,
           typename LayoutPolicyB = raft::row_major>
 class MaskedMatmulTest
@@ -98,7 +101,7 @@ class MaskedMatmulTest
       stream(resource::get_cuda_stream(handle)),
       a_data_d(0, resource::get_cuda_stream(handle)),
       b_data_d(0, resource::get_cuda_stream(handle)),
-      bitmap_d(0, resource::get_cuda_stream(handle)),
+      bits_d(0, resource::get_cuda_stream(handle)),
       c_indptr_d(0, resource::get_cuda_stream(handle)),
       c_indices_d(0, resource::get_cuda_stream(handle)),
       c_data_d(0, resource::get_cuda_stream(handle)),
@@ -107,14 +110,14 @@ class MaskedMatmulTest
   }
 
  protected:
-  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bitmap_t>& bitmap)
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bits_t>& bits)
   {
     index_t total    = static_cast<index_t>(m * n);
     index_t num_ones = static_cast<index_t>((total * 1.0f) * sparsity);
     index_t res      = num_ones;
 
-    for (auto& item : bitmap) {
-      item = static_cast<bitmap_t>(0);
+    for (auto& item : bits) {
+      item = static_cast<bits_t>(0);
     }
 
     std::random_device rd;
@@ -124,8 +127,8 @@ class MaskedMatmulTest
     while (num_ones > 0) {
       index_t index = dis(gen);
 
-      bitmap_t& element    = bitmap[index / (8 * sizeof(bitmap_t))];
-      index_t bit_position = index % (8 * sizeof(bitmap_t));
+      bits_t& element      = bits[index / (8 * sizeof(bits_t))];
+      index_t bit_position = index % (8 * sizeof(bits_t));
 
       if (((element >> bit_position) & 1) == 0) {
         element |= (static_cast<index_t>(1) << bit_position);
@@ -135,7 +138,27 @@ class MaskedMatmulTest
     return res;
   }
 
-  void cpu_convert_to_csr(std::vector<bitmap_t>& bitmap,
+  // Appends `repeat` further copies of the first `input_bits` bits of `inout`, in place.
+  // Copies are written contiguously starting at bit `input_bits`. The vector is not resized
+  // and destination bits are only OR-ed in, so bits already set there stay set.
+  void repeat_cpu_bitset_inplace(std::vector<bits_t>& inout, size_t input_bits, size_t repeat)
+  {
+    constexpr size_t unit_bits = sizeof(bits_t) * 8;
+
+    // Next destination bit; it advances by one for every source bit that is copied.
+    size_t dst_bit = input_bits;
+
+    for (size_t copy = 0; copy < repeat; ++copy) {
+      for (size_t src_bit = 0; src_bit < input_bits; ++src_bit, ++dst_bit) {
+        // Read the source bit from the leading block, then OR it into the destination word.
+        const bool is_set = (inout[src_bit / unit_bits] >> (src_bit % unit_bits)) & 1;
+
+        inout[dst_bit / unit_bits] |= (static_cast<bits_t>(is_set) << (dst_bit % unit_bits));
+      }
+    }
+  }
+
+  void cpu_convert_to_csr(std::vector<bits_t>& bits,
                           index_t rows,
                           index_t cols,
                           std::vector<index_t>& indices,
@@ -146,14 +169,14 @@ class MaskedMatmulTest
     indptr[offset_indptr++] = 0;
 
     index_t index        = 0;
-    bitmap_t element     = 0;
+    bits_t element       = 0;
     index_t bit_position = 0;
 
     for (index_t i = 0; i < rows; ++i) {
       for (index_t j = 0; j < cols; ++j) {
         index        = i * cols + j;
-        element      = bitmap[index / (8 * sizeof(bitmap_t))];
-        bit_position = index % (8 * sizeof(bitmap_t));
+        element      = bits[index / (8 * sizeof(bits_t))];
+        bit_position = index % (8 * sizeof(bits_t));
 
         if (((element >> bit_position) & 1)) {
           indices[offset_values] = static_cast<index_t>(j);
@@ -201,15 +224,17 @@ class MaskedMatmulTest
     index_t b_size = params.k * params.n;
     index_t c_size = params.m * params.n;
 
-    index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bitmap_t) * 8));
-    std::vector<bitmap_t> bitmap_h(element);
+    index_t element = raft::ceildiv(params.m * params.n, index_t(sizeof(bits_t) * 8));
+    std::vector<bits_t> bits_h(element);
+
+    std::memset(bits_h.data(), 0, bits_h.size() * sizeof(bits_t));
 
     std::vector<value_t> a_data_h(a_size);
     std::vector<value_t> b_data_h(b_size);
 
     a_data_d.resize(a_size, stream);
     b_data_d.resize(b_size, stream);
-    bitmap_d.resize(bitmap_h.size(), stream);
+    bits_d.resize(bits_h.size(), stream);
 
     auto blobs_a_b = raft::make_device_matrix<output_t, index_t>(handle, 1, a_size + b_size);
     auto labels    = raft::make_device_vector<index_t, index_t>(handle, 1);
@@ -262,18 +287,27 @@ class MaskedMatmulTest
 
     resource::sync_stream(handle);
 
-    index_t c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bitmap_h);
+    index_t c_true_nnz = 0;
+    if constexpr (bits_layout == BitsLayout::Bitmap) {
+      c_true_nnz = create_sparse_matrix(params.m, params.n, params.sparsity, bits_h);
+    } else if constexpr (bits_layout == BitsLayout::Bitset) {
+      c_true_nnz = create_sparse_matrix(1, params.n, params.sparsity, bits_h);
+      repeat_cpu_bitset_inplace(bits_h, params.n, params.m - 1);
+      c_true_nnz *= params.m;
+    } else {
+      GTEST_SKIP() << "Unsupported BitsLayout!";
+    }
 
     std::vector<index_t> c_indptr_h(params.m + 1);
     std::vector<index_t> c_indices_h(c_true_nnz);
     std::vector<output_t> c_data_h(c_true_nnz);
 
-    cpu_convert_to_csr(bitmap_h, params.m, params.n, c_indices_h, c_indptr_h);
+    cpu_convert_to_csr(bits_h, params.m, params.n, c_indices_h, c_indptr_h);
 
     c_data_d.resize(c_data_h.size(), stream);
 
     update_device(c_data_d.data(), c_data_h.data(), c_data_h.size(), stream);
-    update_device(bitmap_d.data(), bitmap_h.data(), bitmap_h.size(), stream);
+    update_device(bits_d.data(), bits_h.data(), bits_h.size(), stream);
     resource::sync_stream(handle);
 
     cpu_sddmm(a_data_h, b_data_h, c_data_h, c_indices_h, c_indptr_h, true, true);
@@ -304,9 +338,6 @@ class MaskedMatmulTest
     auto B =
       raft::make_device_matrix_view<const value_t, index_t>(b_data_d.data(), params.n, params.k);
 
-    auto mask =
-      raft::core::bitmap_view<const bitmap_t, index_t>(bitmap_d.data(), params.m, params.n);
-
     auto c_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
       c_indptr_d.data(),
       c_indices_d.data(),
@@ -316,7 +347,15 @@ class MaskedMatmulTest
 
     auto C = raft::make_device_csr_matrix_view<output_t>(c_data_d.data(), c_structure);
 
-    raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    if constexpr (bits_layout == BitsLayout::Bitmap) {
+      auto mask = raft::core::bitmap_view<const bits_t, index_t>(bits_d.data(), params.m, params.n);
+      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    } else if constexpr (bits_layout == BitsLayout::Bitset) {
+      auto mask = raft::core::bitset_view<const bits_t, index_t>(bits_d.data(), params.n);
+      raft::sparse::linalg::masked_matmul(handle, A, B, mask, C);
+    } else {
+      GTEST_SKIP() << "Unsupported BitsLayout!";
+    }
 
     resource::sync_stream(handle);
 
@@ -344,7 +383,7 @@ class MaskedMatmulTest
 
   rmm::device_uvector<value_t> a_data_d;
   rmm::device_uvector<value_t> b_data_d;
-  rmm::device_uvector<bitmap_t> bitmap_d;
+  rmm::device_uvector<bits_t> bits_d;
 
   rmm::device_uvector<index_t> c_indptr_d;
   rmm::device_uvector<index_t> c_indices_d;
@@ -353,14 +392,23 @@ class MaskedMatmulTest
   rmm::device_uvector<output_t> c_expected_data_d;
 };
 
-using MaskedMatmulTestF = MaskedMatmulTest<float, float, int>;
-TEST_P(MaskedMatmulTestF, Result) { Run(); }
+using MaskedMatmulOnBitmapTestF = MaskedMatmulTest<float, float, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestF, Result) { Run(); }
+
+using MaskedMatmulOnBitmapTestD = MaskedMatmulTest<double, double, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestD, Result) { Run(); }
 
-using MaskedMatmulTestD = MaskedMatmulTest<double, double, int>;
-TEST_P(MaskedMatmulTestD, Result) { Run(); }
+using MaskedMatmulOnBitmapTestH = MaskedMatmulTest<half, float, int, BitsLayout::Bitmap>;
+TEST_P(MaskedMatmulOnBitmapTestH, Result) { Run(); }
 
-using MaskedMatmulTestH = MaskedMatmulTest<half, float, int>;
-TEST_P(MaskedMatmulTestH, Result) { Run(); }
+using MaskedMatmulOnBitsetTestF = MaskedMatmulTest<float, float, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestF, Result) { Run(); }
+
+using MaskedMatmulOnBitsetTestD = MaskedMatmulTest<double, double, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestD, Result) { Run(); }
+
+using MaskedMatmulOnBitsetTestH = MaskedMatmulTest<half, float, int, BitsLayout::Bitset>;
+TEST_P(MaskedMatmulOnBitsetTestH, Result) { Run(); }
 
 const std::vector<MaskedMatmulInputs<float, float, int>> sddmm_inputs_f = {
   {0.001f, 2, 255, 1023, 0.19, 1234ULL},
@@ -419,11 +467,29 @@ const std::vector<MaskedMatmulInputs<half, float, int>> sddmm_inputs_h = {
   {0.0003f, 31, 1025, 1025, 0.19, 1234ULL},
   {0.001f, 1024, 1024, 1024, 0.1, 1234ULL}};
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestF, ::testing::ValuesIn(sddmm_inputs_f));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestF,
+                        ::testing::ValuesIn(sddmm_inputs_f));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestD,
+                        ::testing::ValuesIn(sddmm_inputs_d));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitmapTestH,
+                        ::testing::ValuesIn(sddmm_inputs_h));
+
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestF,
+                        ::testing::ValuesIn(sddmm_inputs_f));
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestD, ::testing::ValuesIn(sddmm_inputs_d));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestD,
+                        ::testing::ValuesIn(sddmm_inputs_d));
 
-INSTANTIATE_TEST_CASE_P(MaskedMatmulTest, MaskedMatmulTestH, ::testing::ValuesIn(sddmm_inputs_h));
+INSTANTIATE_TEST_CASE_P(MaskedMatmulTest,
+                        MaskedMatmulOnBitsetTestH,
+                        ::testing::ValuesIn(sddmm_inputs_h));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/mst.cu b/cpp/tests/sparse/mst.cu
similarity index 100%
rename from cpp/test/sparse/mst.cu
rename to cpp/tests/sparse/mst.cu
diff --git a/cpp/test/sparse/norm.cu b/cpp/tests/sparse/norm.cu
similarity index 100%
rename from cpp/test/sparse/norm.cu
rename to cpp/tests/sparse/norm.cu
diff --git a/cpp/test/sparse/normalize.cu b/cpp/tests/sparse/normalize.cu
similarity index 100%
rename from cpp/test/sparse/normalize.cu
rename to cpp/tests/sparse/normalize.cu
diff --git a/cpp/test/sparse/reduce.cu b/cpp/tests/sparse/reduce.cu
similarity index 100%
rename from cpp/test/sparse/reduce.cu
rename to cpp/tests/sparse/reduce.cu
diff --git a/cpp/test/sparse/row_op.cu b/cpp/tests/sparse/row_op.cu
similarity index 100%
rename from cpp/test/sparse/row_op.cu
rename to cpp/tests/sparse/row_op.cu
diff --git a/cpp/test/sparse/sddmm.cu b/cpp/tests/sparse/sddmm.cu
similarity index 100%
rename from cpp/test/sparse/sddmm.cu
rename to cpp/tests/sparse/sddmm.cu
diff --git a/cpp/test/sparse/select_k_csr.cu b/cpp/tests/sparse/select_k_csr.cu
similarity index 100%
rename from cpp/test/sparse/select_k_csr.cu
rename to cpp/tests/sparse/select_k_csr.cu
diff --git a/cpp/test/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
similarity index 100%
rename from cpp/test/sparse/solver/lanczos.cu
rename to cpp/tests/sparse/solver/lanczos.cu
diff --git a/cpp/test/sparse/sort.cu b/cpp/tests/sparse/sort.cu
similarity index 100%
rename from cpp/test/sparse/sort.cu
rename to cpp/tests/sparse/sort.cu
diff --git a/cpp/test/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
similarity index 100%
rename from cpp/test/sparse/spectral_matrix.cu
rename to cpp/tests/sparse/spectral_matrix.cu
diff --git a/cpp/test/sparse/spgemmi.cu b/cpp/tests/sparse/spgemmi.cu
similarity index 100%
rename from cpp/test/sparse/spgemmi.cu
rename to cpp/tests/sparse/spgemmi.cu
diff --git a/cpp/test/sparse/spmm.cu b/cpp/tests/sparse/spmm.cu
similarity index 100%
rename from cpp/test/sparse/spmm.cu
rename to cpp/tests/sparse/spmm.cu
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/tests/sparse/symmetrize.cu
similarity index 100%
rename from cpp/test/sparse/symmetrize.cu
rename to cpp/tests/sparse/symmetrize.cu
diff --git a/cpp/test/stats/accuracy.cu b/cpp/tests/stats/accuracy.cu
similarity index 100%
rename from cpp/test/stats/accuracy.cu
rename to cpp/tests/stats/accuracy.cu
diff --git a/cpp/test/stats/adjusted_rand_index.cu b/cpp/tests/stats/adjusted_rand_index.cu
similarity index 100%
rename from cpp/test/stats/adjusted_rand_index.cu
rename to cpp/tests/stats/adjusted_rand_index.cu
diff --git a/cpp/test/stats/completeness_score.cu b/cpp/tests/stats/completeness_score.cu
similarity index 100%
rename from cpp/test/stats/completeness_score.cu
rename to cpp/tests/stats/completeness_score.cu
diff --git a/cpp/test/stats/contingencyMatrix.cu b/cpp/tests/stats/contingencyMatrix.cu
similarity index 100%
rename from cpp/test/stats/contingencyMatrix.cu
rename to cpp/tests/stats/contingencyMatrix.cu
diff --git a/cpp/test/stats/cov.cu b/cpp/tests/stats/cov.cu
similarity index 99%
rename from cpp/test/stats/cov.cu
rename to cpp/tests/stats/cov.cu
index 602f356b9f..3f2a3dcebf 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/tests/stats/cov.cu
@@ -72,7 +72,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     cov_act.resize(cols * cols, stream);
 
     normal(handle, r, data.data(), len, params.mean, var);
-    raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream);
+    raft::stats::mean(mean_act.data(), data.data(), cols, rows, params.rowMajor, stream);
     if (params.rowMajor) {
       using layout = raft::row_major;
       cov(handle,
@@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     raft::update_device(data_cm.data(), data_h, 6, stream);
     raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);
 
-    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream);
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, stream);
     cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
   }
 
diff --git a/cpp/test/stats/dispersion.cu b/cpp/tests/stats/dispersion.cu
similarity index 100%
rename from cpp/test/stats/dispersion.cu
rename to cpp/tests/stats/dispersion.cu
diff --git a/cpp/test/stats/entropy.cu b/cpp/tests/stats/entropy.cu
similarity index 100%
rename from cpp/test/stats/entropy.cu
rename to cpp/tests/stats/entropy.cu
diff --git a/cpp/test/stats/histogram.cu b/cpp/tests/stats/histogram.cu
similarity index 100%
rename from cpp/test/stats/histogram.cu
rename to cpp/tests/stats/histogram.cu
diff --git a/cpp/test/stats/homogeneity_score.cu b/cpp/tests/stats/homogeneity_score.cu
similarity index 100%
rename from cpp/test/stats/homogeneity_score.cu
rename to cpp/tests/stats/homogeneity_score.cu
diff --git a/cpp/test/stats/information_criterion.cu b/cpp/tests/stats/information_criterion.cu
similarity index 100%
rename from cpp/test/stats/information_criterion.cu
rename to cpp/tests/stats/information_criterion.cu
diff --git a/cpp/test/stats/kl_divergence.cu b/cpp/tests/stats/kl_divergence.cu
similarity index 100%
rename from cpp/test/stats/kl_divergence.cu
rename to cpp/tests/stats/kl_divergence.cu
diff --git a/cpp/test/stats/mean.cu b/cpp/tests/stats/mean.cu
similarity index 50%
rename from cpp/test/stats/mean.cu
rename to cpp/tests/stats/mean.cu
index c5fe83d95b..e72d4eaf74 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/tests/stats/mean.cu
@@ -33,7 +33,7 @@ template <typename T>
 struct MeanInputs {
   T tolerance, mean;
   int rows, cols;
-  bool sample, rowMajor;
+  bool rowMajor;
   unsigned long long int seed;
   T stddev = (T)1.0;
 };
@@ -42,7 +42,7 @@ template <typename T>
 ::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
 {
   return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", "
-            << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
+            << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
 }
 
 template <typename T>
@@ -74,14 +74,12 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
       using layout = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
     } else {
       using layout = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
     }
   }
 
@@ -98,72 +96,51 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
 const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, true, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, false, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, true, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, false, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, true, 1234ULL},
-  {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f},
-  {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, false, 1234ULL},
-  {2.0, -1.0, 31, 120, false, false, 1234ULL},
-  {2.0, -1.0, 1, 130, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, true, 1234ULL},
-  {2.0, -1.0, 31, 120, false, true, 1234ULL},
-  {2.0, -1.0, 1, 130, false, true, 1234ULL},
-  {2.0, -1.0, 1, 1, false, false, 1234ULL},
-  {2.0, -1.0, 1, 1, false, true, 1234ULL},
-  {2.0, -1.0, 7, 23, false, false, 1234ULL},
-  {2.0, -1.0, 7, 23, false, true, 1234ULL},
-  {2.0, -1.0, 17, 5, false, false, 1234ULL},
-  {2.0, -1.0, 17, 5, false, true, 1234ULL},
-  {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001},
-  {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}};
+  {0.15f, -1.f, 1024, 32, false, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, true, 1234ULL},
+  {0.15f, -1.f, 1024, 64, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, true, 1234ULL},
+  {0.15f, -1.f, 1024, 256, true, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, 1234ULL},
+  {2.0f, -1.f, 1, 130, false, 1234ULL},
+  {0.15f, -1.f, 1030, 1, true, 1234ULL},
+  {2.0f, -1.f, 31, 120, true, 1234ULL},
+  {2.0f, -1.f, 1, 130, true, 1234ULL},
+  {2.0f, -1.f, 1, 1, false, 1234ULL},
+  {2.0f, -1.f, 1, 1, true, 1234ULL},
+  {2.0f, -1.f, 7, 23, false, 1234ULL},
+  {2.0f, -1.f, 7, 23, true, 1234ULL},
+  {2.0f, -1.f, 17, 5, false, 1234ULL},
+  {2.0f, -1.f, 17, 5, true, 1234ULL},
+  {0.0001f, 0.1f, 1 << 27, 2, false, 1234ULL, 0.0001f},
+  {0.0001f, 0.1f, 1 << 27, 2, true, 1234ULL, 0.0001f}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, -1.0, 1024, 32, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, true, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, false, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, false, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, true, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, false, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, true, 1234ULL},
+                                                 {1e-8, 1e-1, 1 << 27, 2, false, 1234ULL, 0.0001},
+                                                 {1e-8, 1e-1, 1 << 27, 2, true, 1234ULL, 0.0001}};
 
 typedef MeanTest<float> MeanTestF;
 TEST_P(MeanTestF, Result)
diff --git a/cpp/tests/stats/mean_center.cu b/cpp/tests/stats/mean_center.cu
new file mode 100644
index 0000000000..48bf50056c
--- /dev/null
+++ b/cpp/tests/stats/mean_center.cu
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../linalg/matrix_vector_op.cuh"
+#include "../test_utils.cuh"
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/stats/mean.cuh>
+#include <raft/stats/mean_center.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+namespace raft {
+namespace stats {
+
+template <typename T, typename IdxType>
+struct MeanCenterInputs {
+  T tolerance, mean;
+  IdxType rows, cols;
+  bool rowMajor, bcastAlongRows;
+  unsigned long long int seed;
+};
+
+template <typename T, typename IdxType>
+::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs<T, IdxType>& dims)
+{
+  return os;
+}
+
+// Checks mean_center() against the naiveMatVec reference on normally distributed input.
+template <typename T, typename IdxType>
+class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
+ public:
+  MeanCenterTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam()),
+      rows(params.rows),
+      cols(params.cols),
+      data(rows * cols, stream),
+      meanVec(params.bcastAlongRows ? cols : rows, stream),
+      out(rows * cols, stream),
+      out_ref(rows * cols, stream)
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    raft::random::RngState r(params.seed);
+    auto meanVecSize = params.bcastAlongRows ? cols : rows;
+    normal(handle, r, data.data(), rows * cols, params.mean, (T)1.0);
+    // Zero meanVec first: it may be longer than the `cols` means computed below (TODO confirm).
+    RAFT_CUDA_TRY(cudaMemsetAsync(meanVec.data(), 0, meanVec.size() * sizeof(T), stream));
+    raft::stats::mean(meanVec.data(), data.data(), cols, rows, params.rowMajor, stream);
+    if (params.rowMajor) {
+      using layout = raft::row_major;
+      mean_center(handle,
+                  raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+                  raft::make_device_vector_view<const T, int>(meanVec.data(), meanVecSize),
+                  raft::make_device_matrix_view<T, int, layout>(out.data(), rows, cols),
+                  params.bcastAlongRows);
+    } else {
+      using layout = raft::col_major;
+      mean_center(handle,
+                  raft::make_device_matrix_view<const T, int, layout>(data.data(), rows, cols),
+                  raft::make_device_vector_view<const T, int>(meanVec.data(), meanVecSize),
+                  raft::make_device_matrix_view<T, int, layout>(out.data(), rows, cols),
+                  params.bcastAlongRows);
+    }
+    raft::linalg::naiveMatVec(out_ref.data(),
+                              data.data(),
+                              meanVec.data(),
+                              cols,
+                              rows,
+                              params.rowMajor,
+                              params.bcastAlongRows,
+                              (T)-1.0,
+                              stream);
+    resource::sync_stream(handle, stream);
+  }
+
+  raft::resources handle;
+  cudaStream_t stream;
+  MeanCenterInputs<T, IdxType> params;
+  int rows, cols;
+  rmm::device_uvector<T> data, meanVec, out, out_ref;
+};
+
+const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};
+typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
+TEST_P(MeanCenterTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), out.size(), raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
+
+const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};
+typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
+TEST_P(MeanCenterTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), out.size(), raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
+
+const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};
+typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
+TEST_P(MeanCenterTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), out.size(), raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
+
+const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};
+typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
+TEST_P(MeanCenterTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), out.size(), raft::CompareApprox<double>(params.tolerance)));
+}
+
+}  // end namespace stats
+}  // end namespace raft
diff --git a/cpp/test/stats/meanvar.cu b/cpp/tests/stats/meanvar.cu
similarity index 100%
rename from cpp/test/stats/meanvar.cu
rename to cpp/tests/stats/meanvar.cu
diff --git a/cpp/test/stats/minmax.cu b/cpp/tests/stats/minmax.cu
similarity index 100%
rename from cpp/test/stats/minmax.cu
rename to cpp/tests/stats/minmax.cu
diff --git a/cpp/test/stats/mutual_info_score.cu b/cpp/tests/stats/mutual_info_score.cu
similarity index 100%
rename from cpp/test/stats/mutual_info_score.cu
rename to cpp/tests/stats/mutual_info_score.cu
diff --git a/cpp/test/stats/r2_score.cu b/cpp/tests/stats/r2_score.cu
similarity index 100%
rename from cpp/test/stats/r2_score.cu
rename to cpp/tests/stats/r2_score.cu
diff --git a/cpp/test/stats/rand_index.cu b/cpp/tests/stats/rand_index.cu
similarity index 100%
rename from cpp/test/stats/rand_index.cu
rename to cpp/tests/stats/rand_index.cu
diff --git a/cpp/test/stats/regression_metrics.cu b/cpp/tests/stats/regression_metrics.cu
similarity index 100%
rename from cpp/test/stats/regression_metrics.cu
rename to cpp/tests/stats/regression_metrics.cu
diff --git a/cpp/test/stats/stddev.cu b/cpp/tests/stats/stddev.cu
similarity index 99%
rename from cpp/test/stats/stddev.cu
rename to cpp/tests/stats/stddev.cu
index f4c5f92f49..a9a70b1e60 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/tests/stats/stddev.cu
@@ -81,8 +81,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T, int>(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view<T, int>(mean_act.data(), cols));
 
       stddev(handle,
              raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
@@ -99,8 +98,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
-           raft::make_device_vector_view<T>(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view<T>(mean_act.data(), cols));
 
       stddev(handle,
              raft::make_device_matrix_view<const T, int, layout_t>(data, rows, cols),
diff --git a/cpp/test/stats/sum.cu b/cpp/tests/stats/sum.cu
similarity index 100%
rename from cpp/test/stats/sum.cu
rename to cpp/tests/stats/sum.cu
diff --git a/cpp/test/stats/v_measure.cu b/cpp/tests/stats/v_measure.cu
similarity index 100%
rename from cpp/test/stats/v_measure.cu
rename to cpp/tests/stats/v_measure.cu
diff --git a/cpp/test/stats/weighted_mean.cu b/cpp/tests/stats/weighted_mean.cu
similarity index 99%
rename from cpp/test/stats/weighted_mean.cu
rename to cpp/tests/stats/weighted_mean.cu
index 407f3f14ea..e125fbc71e 100644
--- a/cpp/test/stats/weighted_mean.cu
+++ b/cpp/tests/stats/weighted_mean.cu
@@ -340,4 +340,4 @@ TEST_P(WeightedMeanTestD, Result)
 INSTANTIATE_TEST_CASE_P(WeightedMeanTest, WeightedMeanTestD, ::testing::ValuesIn(inputsd));
 
 };  // end namespace stats
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/test/test.cpp b/cpp/tests/test.cpp
similarity index 100%
rename from cpp/test/test.cpp
rename to cpp/tests/test.cpp
diff --git a/cpp/test/test_utils.cuh b/cpp/tests/test_utils.cuh
similarity index 99%
rename from cpp/test/test_utils.cuh
rename to cpp/tests/test_utils.cuh
index 810a0d7985..ac4ed4d24e 100644
--- a/cpp/test/test_utils.cuh
+++ b/cpp/tests/test_utils.cuh
@@ -330,4 +330,4 @@ inline std::vector<float> read_csv(std::string filename, bool skip_first_n_colum
   return result;
 }
 
-};  // end namespace raft
\ No newline at end of file
+};  // end namespace raft
diff --git a/cpp/test/test_utils.h b/cpp/tests/test_utils.h
similarity index 100%
rename from cpp/test/test_utils.h
rename to cpp/tests/test_utils.h
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/tests/util/bitonic_sort.cu
similarity index 100%
rename from cpp/test/util/bitonic_sort.cu
rename to cpp/tests/util/bitonic_sort.cu
diff --git a/cpp/test/util/cudart_utils.cpp b/cpp/tests/util/cudart_utils.cpp
similarity index 100%
rename from cpp/test/util/cudart_utils.cpp
rename to cpp/tests/util/cudart_utils.cpp
diff --git a/cpp/test/util/device_atomics.cu b/cpp/tests/util/device_atomics.cu
similarity index 100%
rename from cpp/test/util/device_atomics.cu
rename to cpp/tests/util/device_atomics.cu
diff --git a/cpp/test/util/integer_utils.cpp b/cpp/tests/util/integer_utils.cpp
similarity index 100%
rename from cpp/test/util/integer_utils.cpp
rename to cpp/tests/util/integer_utils.cpp
diff --git a/cpp/test/util/integer_utils.cu b/cpp/tests/util/integer_utils.cu
similarity index 100%
rename from cpp/test/util/integer_utils.cu
rename to cpp/tests/util/integer_utils.cu
diff --git a/cpp/test/util/memory_type_dispatcher.cu b/cpp/tests/util/memory_type_dispatcher.cu
similarity index 100%
rename from cpp/test/util/memory_type_dispatcher.cu
rename to cpp/tests/util/memory_type_dispatcher.cu
diff --git a/cpp/test/util/popc.cu b/cpp/tests/util/popc.cu
similarity index 100%
rename from cpp/test/util/popc.cu
rename to cpp/tests/util/popc.cu
diff --git a/cpp/test/util/pow2_utils.cu b/cpp/tests/util/pow2_utils.cu
similarity index 100%
rename from cpp/test/util/pow2_utils.cu
rename to cpp/tests/util/pow2_utils.cu
diff --git a/cpp/test/util/reduction.cu b/cpp/tests/util/reduction.cu
similarity index 100%
rename from cpp/test/util/reduction.cu
rename to cpp/tests/util/reduction.cu
diff --git a/dependencies.yaml b/dependencies.yaml
index 1772c5d539..c9befcb53a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,25 +3,26 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.8", "12.5"]
+      cuda: ["11.8", "12.8"]
       arch: [x86_64, aarch64]
     includes:
-      - rapids_build
-      - build_pylibraft
+      - build_common
+      - build_cython
+      - checks
       - cuda
       - cuda_version
+      - depends_on_cuda_python
       - depends_on_cupy
       - depends_on_distributed_ucxx
+      - depends_on_rmm
       - develop
-      - checks
-      - test_libraft
       - docs
-      - rapids_build_setuptools
       - rapids_build_skbuild
-      - run_raft_dask
       - run_pylibraft
-      - test_python_common
+      - run_raft_dask
+      - test_libraft
       - test_pylibraft
+      - test_python_common
   test_cpp:
     output: none
     includes:
@@ -31,10 +32,10 @@ files:
     output: none
     includes:
       - cuda_version
+      - depends_on_cupy
       - py_version
-      - test_python_common
       - test_pylibraft
-      - depends_on_cupy
+      - test_python_common
   checks:
     output: none
     includes:
@@ -48,6 +49,29 @@ files:
       - docs
       - py_version
       - test_pylibraft
+  py_build_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_rapids_build_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - build_common
+      - depends_on_librmm
+  py_run_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: project
+    includes:
+      - cuda_wheels
   py_build_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -62,15 +86,21 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
-      - rapids_build
-      - build_pylibraft
+      - build_common
+      - build_cython
+      - depends_on_libraft
+      - depends_on_librmm
+      - depends_on_cuda_python
+      - depends_on_rmm
   py_run_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
     extras:
       table: project
     includes:
-      - cuda_wheels
+      - depends_on_libraft
+      - depends_on_cuda_python
+      - depends_on_rmm
       - run_pylibraft
   py_test_pylibraft:
     output: pyproject
@@ -79,9 +109,9 @@ files:
       table: project.optional-dependencies
       key: test
     includes:
-      - test_python_common
-      - test_pylibraft
       - depends_on_cupy
+      - test_pylibraft
+      - test_python_common
   py_build_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -96,7 +126,10 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
-      - rapids_build
+      - build_common
+      - build_cython
+      - depends_on_libraft
+      - depends_on_librmm
       - depends_on_ucx_build
   py_run_raft_dask:
     output: pyproject
@@ -104,8 +137,9 @@ files:
     extras:
       table: project
     includes:
-      - run_raft_dask
       - depends_on_distributed_ucxx
+      - depends_on_libraft
+      - run_raft_dask
   py_test_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -125,39 +159,53 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0
+          - rapids-build-backend>=0.3.0,<0.4.0.dev0
       - output_types: [conda]
         packages:
           - scikit-build-core>=0.10.0
       - output_types: [requirements, pyproject]
         packages:
           - scikit-build-core[pyproject]>=0.10.0
-  rapids_build:
+  build_common:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4,!=3.30.0
-          - cython>=3.0.0,<3.1.0a0
           - ninja
       - output_types: [conda]
         packages:
           - c-compiler
           - cxx-compiler
+          - libucxx==0.42.*,>=0.0.0a0
           - nccl>=2.19
-          - libucxx==0.41.*,>=0.0.0a0
+          - spdlog>=1.14.1,<1.15
     specific:
       - output_types: conda
         matrices:
           - matrix:
               arch: x86_64
+              cuda: "11.8"
             packages:
               - gcc_linux-64=11.*
-              - sysroot_linux-64==2.17
+              - sysroot_linux-64==2.28
           - matrix:
               arch: aarch64
+              cuda: "11.8"
             packages:
               - gcc_linux-aarch64=11.*
-              - sysroot_linux-aarch64==2.17
+              - sysroot_linux-aarch64==2.28
+          - matrix:
+              arch: x86_64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-64=13.*
+              - sysroot_linux-64==2.28
+          - matrix:
+              arch: aarch64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-aarch64=13.*
+              - sysroot_linux-aarch64==2.28
       - output_types: conda
         matrices:
           - matrix: {cuda: "12.*"}
@@ -178,45 +226,11 @@ dependencies:
             packages: [nvcc_linux-64=11.2]
           - matrix: {cuda: "11.2", arch: aarch64}
             packages: [nvcc_linux-aarch64=11.2]
-
-  build_pylibraft:
+  build_cython:
     common:
-      - output_types: [conda]
-        packages:
-          - &rmm_unsuffixed rmm==24.12.*,>=0.0.0a0
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
       - output_types: [conda, requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-            packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0
-          - matrix:
-              cuda: "11.*"
-            packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3
-          - matrix:
-            packages:
-              - &cuda_python cuda-python
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu12 rmm-cu12==24.12.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu11 rmm-cu11==24.12.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_unsuffixed] }
+        packages:
+          - cython>=3.0.0,<3.1.0a0
   checks:
     common:
       - output_types: [conda, requirements]
@@ -260,6 +274,10 @@ dependencies:
               cuda: "12.5"
             packages:
               - cuda-version=12.5
+          - matrix:
+              cuda: "12.8"
+            packages:
+              - cuda-version=12.8
   cuda:
     specific:
       - output_types: conda
@@ -344,11 +362,14 @@ dependencies:
               - nvidia-curand-cu12
               - nvidia-cusolver-cu12
               - nvidia-cusparse-cu12
-          # CUDA 11 does not provide wheels, so use the system libraries instead
           - matrix:
               cuda: "11.*"
               use_cuda_wheels: "true"
             packages:
+              - nvidia-cublas-cu11
+              - nvidia-curand-cu11
+              - nvidia-cusolver-cu11
+              - nvidia-cusparse-cu11
           # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
           # (e.g. for DLFW and pip devcontainers)
           - matrix:
@@ -397,13 +418,6 @@ dependencies:
           - recommonmark
           - sphinx-copybutton
           - sphinx-markdown-tables
-  rapids_build_setuptools:
-    common:
-      - output_types: [requirements, pyproject]
-        packages:
-          - wheel
-          - setuptools
-          - *rapids_build_backend
   py_version:
     specific:
       - output_types: conda
@@ -428,58 +442,95 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - numpy>=1.23,<3.0a0
-      - output_types: [conda]
+  run_raft_dask:
+    common:
+      - output_types: [conda, pyproject]
+        packages:
+          - dask-cuda==25.2.*,>=0.0.0a0
+          - rapids-dask-dependency==25.2.*,>=0.0.0a0
+      - output_types: conda
         packages:
-          - *rmm_unsuffixed
+          - &pylibraft_unsuffixed pylibraft==25.2.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.42.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for cudf and rmm.
           - --extra-index-url=https://pypi.nvidia.com
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - &pylibraft_cu12 pylibraft-cu12==25.2.*,>=0.0.0a0
+              - &ucx_py_cu12 ucx-py-cu12==0.42.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - &pylibraft_cu11 pylibraft-cu11==25.2.*,>=0.0.0a0
+              - &ucx_py_cu11 ucx-py-cu11==0.42.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibraft_unsuffixed, *ucx_py_unsuffixed]}
+  test_python_common:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - pytest==7.*
+          - pytest-cov
+  test_pylibraft:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - scikit-learn
+          - scipy
+  depends_on_cuda_python:
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
             packages:
-              - *cuda_python12
+              - cuda-python>=12.6.2,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - *cuda_python11
+              - cuda-python>=11.8.5,<12.0a0
           - matrix:
             packages:
-              - *cuda_python
+              - cuda-python
+  depends_on_distributed_ucxx:
+    common:
+      - output_types: conda
+        packages:
+          # UCXX is not currently a hard dependency, so it is only installed during tests;
+          # this will change in the future.
+          - &distributed_ucxx_unsuffixed distributed-ucxx==0.42.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - *rmm_cu12
+              - distributed-ucxx-cu12==0.42.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - *rmm_cu11
-          - {matrix: null, packages: [*rmm_unsuffixed]}
-  run_raft_dask:
+              - distributed-ucxx-cu11==0.42.*,>=0.0.0a0
+          - {matrix: null, packages: [*distributed_ucxx_unsuffixed]}
+  depends_on_libraft:
     common:
-      - output_types: [conda, pyproject]
-        packages:
-          - dask-cuda==24.12.*,>=0.0.0a0
-          - joblib>=0.11
-          - numba>=0.57
-          - rapids-dask-dependency==24.12.*,>=0.0.0a0
-      - output_types: conda
-        packages:
-          - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0
-          - &ucx_py_unsuffixed ucx-py==0.41.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for cudf and rmm.
           - --extra-index-url=https://pypi.nvidia.com
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
@@ -489,34 +540,46 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - &pylibraft_cu12 pylibraft-cu12==24.12.*,>=0.0.0a0
-              - &ucx_py_cu12 ucx-py-cu12==0.41.*,>=0.0.0a0
+              - libraft-cu12==25.2.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - &pylibraft_cu11 pylibraft-cu11==24.12.*,>=0.0.0a0
-              - &ucx_py_cu11 ucx-py-cu11==0.41.*,>=0.0.0a0
-          - {matrix: null, packages: [*pylibraft_unsuffixed, *ucx_py_unsuffixed]}
-  test_python_common:
+              - libraft-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - libraft==25.2.*,>=0.0.0a0
+  depends_on_librmm:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: conda
         packages:
-          - pytest==7.*
-          - pytest-cov
-  test_pylibraft:
-    common:
-      - output_types: [conda, requirements, pyproject]
+          - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0
+      - output_types: requirements
         packages:
-          - scikit-learn
-          - scipy
-  depends_on_distributed_ucxx:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *librmm_unsuffixed
+  depends_on_rmm:
     common:
       - output_types: conda
         packages:
-          # UCXX is not currently a hard-dependency thus only installed during tests,
-          # this will change in the future.
-          - &distributed_ucxx_unsuffixed distributed-ucxx==0.41.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -529,13 +592,15 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - distributed-ucxx-cu12==0.41.*,>=0.0.0a0
+              - rmm-cu12==25.2.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - distributed-ucxx-cu11==0.41.*,>=0.0.0a0
-          - {matrix: null, packages: [*distributed_ucxx_unsuffixed]}
+              - rmm-cu11==25.2.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *rmm_unsuffixed
   depends_on_ucx_build:
     common:
       - output_types: conda
diff --git a/docs/README.md b/docs/README.md
index a09ccf41eb..aa5e114347 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -11,4 +11,4 @@ bash build.sh docs
 #### Once the process finishes, documentation can be found in build/html
 ```shell script
 xdg-open build/html/index.html`
-```
\ No newline at end of file
+```
diff --git a/docs/source/_static/references.css b/docs/source/_static/references.css
index 225cf13ba9..d1f647233a 100644
--- a/docs/source/_static/references.css
+++ b/docs/source/_static/references.css
@@ -20,4 +20,4 @@ dl.citation > dt.label > span::before {
 /* Add closing bracket */
 dl.citation > dt.label > span::after {
   content: "]";
-}
\ No newline at end of file
+}
diff --git a/docs/source/build.md b/docs/source/build.md
index 5a0dbf7e11..237c54ce6b 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -42,7 +42,7 @@ mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-vers
 
 ```bash
 # for CUDA 12.0
-mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.0
+mamba install -c rapidsai -c conda-forge -c nvidia raft-dask pylibraft cuda-version=12.8
 ```
 
 Note that the above commands will also install `libraft-headers` and `libraft`.
@@ -50,7 +50,7 @@ Note that the above commands will also install `libraft-headers` and `libraft`.
 You can also install the conda packages individually using the `mamba` command above. For example, if you'd like to install RAFT's headers to use in your project:
 ```bash
 # for CUDA 12.0
-mamba install -c rapidsai -c conda-forge -c nvidia libraft-headers cuda-version=12.0
+mamba install -c rapidsai -c conda-forge -c nvidia libraft-headers cuda-version=12.8
 ```
 
 ## Installing Python through Pip
@@ -99,7 +99,7 @@ In addition to the libraries included with cudatoolkit 11.8+, there are some oth
 
 Conda environment scripts are provided for installing the necessary dependencies to build both the C++ and Python libraries from source. It is preferred to use `mamba`, as it provides significant speedup over `conda`:
 ```bash
-mamba env create --name rapids_raft -f conda/environments/all_cuda-125_arch-x86_64.yaml
+mamba env create --name rapids_raft -f conda/environments/all_cuda-128_arch-x86_64.yaml
 mamba activate rapids_raft
 ```
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7a287b689f..e5e6e0871a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -208,7 +208,7 @@ def setup(app):
 linkcode_resolve = make_linkcode_resolve(
     "pylibraft",
     "https://github.com/rapidsai/raft"
-    "raft/blob/{revision}/python/pylibraft"
+    "/blob/{revision}/python/pylibraft/"
     "{package}/{path}#L{lineno}",
 )
 
diff --git a/docs/source/contributing.md b/docs/source/contributing.md
index 1b4071d0a5..446e7b2a7b 100755
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -89,5 +89,3 @@ implementation of the issue, ask them in the issue instead of the PR.
 
 ## Attribution
 Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
-
-
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
index 74f706bf46..837cfa0cb0 100644
--- a/docs/source/cpp_api.rst
+++ b/docs/source/cpp_api.rst
@@ -16,4 +16,4 @@ C++ API
    cpp_api/solver.rst
    cpp_api/sparse.rst
    cpp_api/stats.rst
-   cpp_api/utils.rst
\ No newline at end of file
+   cpp_api/utils.rst
diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst
index 4122a18506..f159c85af8 100644
--- a/docs/source/cpp_api/core.rst
+++ b/docs/source/cpp_api/core.rst
@@ -22,4 +22,4 @@ expose in public APIs.
    core_operators.rst
    core_math.rst
    core_bitset.rst
-   core_bitmap.rst
\ No newline at end of file
+   core_bitmap.rst
diff --git a/docs/source/cpp_api/core_bitmap.rst b/docs/source/cpp_api/core_bitmap.rst
index 6c1dc607bf..532da58e71 100644
--- a/docs/source/cpp_api/core_bitmap.rst
+++ b/docs/source/cpp_api/core_bitmap.rst
@@ -12,4 +12,4 @@ namespace *raft::core*
 .. doxygengroup:: bitmap
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/core_bitset.rst b/docs/source/cpp_api/core_bitset.rst
index af1cff6d37..117efc5466 100644
--- a/docs/source/cpp_api/core_bitset.rst
+++ b/docs/source/cpp_api/core_bitset.rst
@@ -12,4 +12,4 @@ namespace *raft::core*
 .. doxygengroup:: bitset
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/core_kvp.rst b/docs/source/cpp_api/core_kvp.rst
index 60a0da078b..5f0cfd800a 100644
--- a/docs/source/cpp_api/core_kvp.rst
+++ b/docs/source/cpp_api/core_kvp.rst
@@ -12,4 +12,3 @@ namespace *raft::core*
 .. doxygenstruct:: raft::KeyValuePair
     :project: RAFT
     :members:
-
diff --git a/docs/source/cpp_api/core_logger.rst b/docs/source/cpp_api/core_logger.rst
index 60714a63ea..569f17fac3 100644
--- a/docs/source/cpp_api/core_logger.rst
+++ b/docs/source/cpp_api/core_logger.rst
@@ -12,4 +12,3 @@ namespace *raft::core*
 .. doxygenclass:: raft::logger
     :project: RAFT
     :members:
-
diff --git a/docs/source/cpp_api/core_nvtx.rst b/docs/source/cpp_api/core_nvtx.rst
index addcbdda30..051c66da0c 100644
--- a/docs/source/cpp_api/core_nvtx.rst
+++ b/docs/source/cpp_api/core_nvtx.rst
@@ -13,5 +13,3 @@ namespace *raft::core*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst
index 3cd928c9db..b9da44e431 100644
--- a/docs/source/cpp_api/linalg.rst
+++ b/docs/source/cpp_api/linalg.rst
@@ -4,7 +4,7 @@ Linear Algebra
 This page provides C++ class references for the publicly-exposed elements of the `raft/linalg` (dense) linear algebra headers.
 In addition to providing highly optimized arithmetic and matrix/vector operations, RAFT provides a consistent user experience
 by providing common BLAS routines, standard linear system solvers, factorization and eigenvalue solvers. Some of these routines
-hide the complexities of lower-level C-based libraries provided in the CUDA toolkit 
+hide the complexities of lower-level C-based libraries provided in the CUDA toolkit
 
 .. role:: py(code)
    :language: c++
@@ -19,4 +19,4 @@ hide the complexities of lower-level C-based libraries provided in the CUDA tool
    linalg_map_reduce.rst
    linalg_matrix.rst
    linalg_matrix_vector.rst
-   linalg_solver.rst
\ No newline at end of file
+   linalg_solver.rst
diff --git a/docs/source/cpp_api/linalg_arithmetic.rst b/docs/source/cpp_api/linalg_arithmetic.rst
index 7bc428b9f0..badb9f31a5 100644
--- a/docs/source/cpp_api/linalg_arithmetic.rst
+++ b/docs/source/cpp_api/linalg_arithmetic.rst
@@ -114,4 +114,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/linalg_matrix.rst b/docs/source/cpp_api/linalg_matrix.rst
index e6024bcd02..30eef5f64f 100644
--- a/docs/source/cpp_api/linalg_matrix.rst
+++ b/docs/source/cpp_api/linalg_matrix.rst
@@ -16,4 +16,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/linalg_matrix_vector.rst b/docs/source/cpp_api/linalg_matrix_vector.rst
index d92a3c9874..cc22327c74 100644
--- a/docs/source/cpp_api/linalg_matrix_vector.rst
+++ b/docs/source/cpp_api/linalg_matrix_vector.rst
@@ -29,4 +29,3 @@ namespace *raft::linalg*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/matrix_manipulation.rst b/docs/source/cpp_api/matrix_manipulation.rst
index d0da51e4b7..5437ced99f 100644
--- a/docs/source/cpp_api/matrix_manipulation.rst
+++ b/docs/source/cpp_api/matrix_manipulation.rst
@@ -41,4 +41,3 @@ namespace *raft::matrix*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/matrix_reduction.rst b/docs/source/cpp_api/matrix_reduction.rst
index 440a1528b4..92dcea6428 100644
--- a/docs/source/cpp_api/matrix_reduction.rst
+++ b/docs/source/cpp_api/matrix_reduction.rst
@@ -16,4 +16,4 @@ namespace *raft::matrix*
 .. doxygengroup:: matrix_norm
     :project: RAFT
     :members:
-    :content-only:
\ No newline at end of file
+    :content-only:
diff --git a/docs/source/cpp_api/mdspan_representation.rst b/docs/source/cpp_api/mdspan_representation.rst
index 386e6f14e9..939f1d51be 100644
--- a/docs/source/cpp_api/mdspan_representation.rst
+++ b/docs/source/cpp_api/mdspan_representation.rst
@@ -66,5 +66,3 @@ Accessors
 
 .. doxygentypedef:: raft::managed_accessor
     :project: RAFT
-
-
diff --git a/docs/source/cpp_api/mdspan_span.rst b/docs/source/cpp_api/mdspan_span.rst
index 870c4329d0..1b7d749810 100644
--- a/docs/source/cpp_api/mdspan_span.rst
+++ b/docs/source/cpp_api/mdspan_span.rst
@@ -25,4 +25,3 @@ span: One-dimensional Non-owning View
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/mnmg.rst b/docs/source/cpp_api/mnmg.rst
index 9543cbb4ee..1f9f75dd46 100644
--- a/docs/source/cpp_api/mnmg.rst
+++ b/docs/source/cpp_api/mnmg.rst
@@ -47,4 +47,3 @@ NCCL+UCX Comms
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/random.rst b/docs/source/cpp_api/random.rst
index 9f5cdc7a74..8eaa82c0b0 100644
--- a/docs/source/cpp_api/random.rst
+++ b/docs/source/cpp_api/random.rst
@@ -26,4 +26,3 @@ namespace *raft::random*
    random_sampling_univariate.rst
    random_sampling_multivariable.rst
    random_sampling_without_replacement.rst
-
diff --git a/docs/source/cpp_api/random_datagen.rst b/docs/source/cpp_api/random_datagen.rst
index a07f5e0154..e97283598e 100644
--- a/docs/source/cpp_api/random_datagen.rst
+++ b/docs/source/cpp_api/random_datagen.rst
@@ -43,4 +43,3 @@ namespace *raft::random*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/random_sampling_without_replacement.rst b/docs/source/cpp_api/random_sampling_without_replacement.rst
index ac0d3bea86..af5281a48b 100644
--- a/docs/source/cpp_api/random_sampling_without_replacement.rst
+++ b/docs/source/cpp_api/random_sampling_without_replacement.rst
@@ -22,5 +22,3 @@ namespace *raft::random*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst
index 64197accaf..ee170b3721 100644
--- a/docs/source/cpp_api/sparse.rst
+++ b/docs/source/cpp_api/sparse.rst
@@ -16,4 +16,3 @@ Core to RAFT's computational patterns for sparse data is its vocabulary of spars
    sparse_linalg.rst
    sparse_matrix.rst
    sparse_solver.rst
-
diff --git a/docs/source/cpp_api/sparse_types_coo_matrix.rst b/docs/source/cpp_api/sparse_types_coo_matrix.rst
index 855d89fdea..c1d8748a64 100644
--- a/docs/source/cpp_api/sparse_types_coo_matrix.rst
+++ b/docs/source/cpp_api/sparse_types_coo_matrix.rst
@@ -36,4 +36,3 @@ Host COO Matrix
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/sparse_types_csr_matrix.rst b/docs/source/cpp_api/sparse_types_csr_matrix.rst
index b704846c4e..22898a6399 100644
--- a/docs/source/cpp_api/sparse_types_csr_matrix.rst
+++ b/docs/source/cpp_api/sparse_types_csr_matrix.rst
@@ -36,4 +36,3 @@ Host CSR Matrix
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_classification.rst b/docs/source/cpp_api/stats_classification.rst
index 929d2808f3..bc472c831d 100644
--- a/docs/source/cpp_api/stats_classification.rst
+++ b/docs/source/cpp_api/stats_classification.rst
@@ -17,4 +17,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_probability.rst b/docs/source/cpp_api/stats_probability.rst
index 457879d87c..a77a0d9132 100644
--- a/docs/source/cpp_api/stats_probability.rst
+++ b/docs/source/cpp_api/stats_probability.rst
@@ -53,4 +53,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
diff --git a/docs/source/cpp_api/stats_regression.rst b/docs/source/cpp_api/stats_regression.rst
index 8c172b441d..fed5f806a4 100644
--- a/docs/source/cpp_api/stats_regression.rst
+++ b/docs/source/cpp_api/stats_regression.rst
@@ -41,5 +41,3 @@ namespace *raft::stats*
     :project: RAFT
     :members:
     :content-only:
-
-
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index c4a099fabb..1a2626f2b2 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/.clang-format).
 
 [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter.
 In order to run doxygen as a linter on C++/CUDA code, run
@@ -205,13 +205,13 @@ you can run  `codespell -i 3 -w .` from the repository root directory.
 This will bring up an interactive prompt to select which spelling fixes to apply.
 
 ### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
 Manually, run the following to bulk-fix include style issues:
 ```bash
-python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list of folders which you want to fix]
+python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/tests ... list of folders which you want to fix]
 ```
 
 ### Copyright header
@@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
@@ -256,14 +256,14 @@ There are 7 logging levels with each successive level becoming quieter:
 7. RAFT_LEVEL_OFF
    Pass one of these as per your needs into the `set_level()` method as follows:
 ```cpp
-raft::logger::get().set_level(RAFT_LEVEL_WARN);
+raft::default_logger().set_level(RAFT_LEVEL_WARN);
 // From now onwards, this will print only WARN and above kind of messages
 ```
 
 ### Changing logging pattern
 Pass the [format string](https://github.com/gabime/spdlog/wiki/3.-Custom-formatting) as follows in order use a different logging pattern than the default.
 ```cpp
-raft::logger::get.set_pattern(YourFavoriteFormat);
+raft::default_logger().set_pattern(YourFavoriteFormat);
 ```
 One can also use the corresponding `get_pattern()` method to know the current format as well.
 
@@ -298,9 +298,9 @@ RAFT is a heavily templated library. Several core functions are expensive to com
 
 **Macros.** We define the macros `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY`. The `RAFT_COMPILED` macro is defined by `CMake` when compiling code that (1) is part of `libraft.so` or (2) is linked with `libraft.so`. It indicates that a precompiled `libraft.so` is present at runtime.
 
-The `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro is defined by `CMake` during compilation of `libraft.so` itself. When defined, it indicates that implicit instantiations of expensive function templates are forbidden (they result in a compiler error). In the RAFT project, we additionally define this macro during compilation of the tests and benchmarks. 
+The `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro is defined by `CMake` during compilation of `libraft.so` itself. When defined, it indicates that implicit instantiations of expensive function templates are forbidden (they result in a compiler error). In the RAFT project, we additionally define this macro during compilation of the tests and benchmarks.
 
-Below, we summarize which combinations of `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` are used in practice and what the effect of the combination is. 
+Below, we summarize which combinations of `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` are used in practice and what the effect of the combination is.
 
 | RAFT_COMPILED | RAFT_EXPLICIT_INSTANTIATE_ONLY | Which targets                                                                                        |
 |---------------|--------------------------------|------------------------------------------------------------------------------------------------------|
@@ -349,7 +349,7 @@ The file `expensive-ext.cuh` contains the following:
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 namespace raft {
-// (1) define templates to raise an error in case of accidental instantiation 
+// (1) define templates to raise an error in case of accidental instantiation
 template <typename T> void expensive(T arg) RAFT_EXPLICIT;
 } // namespace raft
 #endif //RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -371,7 +371,7 @@ template void raft::expensive<int>(int);
 template void raft::expensive<float>(float);
 ```
 
-**Design considerations**: 
+**Design considerations**:
 
 1. In the `-ext.cuh` header, do not include implementation headers. Only include function parameter types and types that are used to instantiate the templates. If a primitive takes custom parameter types, define them in a separate header called `<primitive_name>_types.hpp`. (see [Common Design Considerations](https://github.com/rapidsai/raft/blob/7b065aff81a0b1976e2a9e2f3de6690361a1111b/docs/source/developer_guide.md#common-design-considerations)).
 
@@ -381,7 +381,7 @@ template void raft::expensive<float>(float);
 
 4. If a header file defines multiple expensive templates, it can be that one of them is not instantiated. In this case, **do define** the template with `RAFT_EXPLICIT` in the `-ext` header. This way, when the template is instantiated, the developer gets a helpful error message instead of a confusing "function not found".
 
-This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation. 
+This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation.
 
 ## Testing
 
diff --git a/docs/source/pylibraft_api/random.rst b/docs/source/pylibraft_api/random.rst
index 538d932757..dbfd7b2fa1 100644
--- a/docs/source/pylibraft_api/random.rst
+++ b/docs/source/pylibraft_api/random.rst
@@ -9,4 +9,4 @@ This page provides pylibraft class references for the publicly-exposed elements
    :class: highlight
 
 
-.. autofunction:: pylibraft.random.rmat
\ No newline at end of file
+.. autofunction:: pylibraft.random.rmat
diff --git a/docs/source/pylibraft_api/sparse.rst b/docs/source/pylibraft_api/sparse.rst
index b2c3f7a2b1..9ba265c6c9 100644
--- a/docs/source/pylibraft_api/sparse.rst
+++ b/docs/source/pylibraft_api/sparse.rst
@@ -8,4 +8,4 @@ This page provides pylibraft class references for the publicly-exposed elements
    :language: python
    :class: highlight
 
-.. autofunction:: pylibraft.sparse.linalg.eigsh
\ No newline at end of file
+.. autofunction:: pylibraft.sparse.linalg.eigsh
diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py
index a7a46fdd9d..5712bbe5cb 100644
--- a/docs/source/sphinxext/github_link.py
+++ b/docs/source/sphinxext/github_link.py
@@ -1,5 +1,20 @@
 # This contains code with copyright by the scikit-learn project, subject to the
 # license in /thirdparty/LICENSES/LICENSE.scikit_learn
+#
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 import inspect
 import os
@@ -96,15 +111,14 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision):
             # fn is expected to be the absolute path.
             fn = os.path.relpath(source_file, start=package)
             print("{}:{}".format(
-                os.path.abspath(os.path.join("..", "python", "cuml", fn)),
+                os.path.abspath(os.path.join("..", "python", "pylibraft", fn)),
                 lineno))
         else:
             return
     else:
-        # Test if we are absolute or not (pyx are relative)
-        if (not os.path.isabs(fn)):
-            # Should be relative to docs right now
-            fn = os.path.abspath(os.path.join("..", "python", fn))
+        if fn.endswith(".pyx"):
+            sp_path = next(x for x in sys.path if re.match(".*site-packages$", x))
+            fn = fn.replace("/opt/conda/conda-bld/work/python/pylibraft", sp_path)
 
         # Convert to relative from module root
         fn = os.path.relpath(fn,
diff --git a/pyproject.toml b/pyproject.toml
index 5042113388..460c0312a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ ignore_missing_imports = true
 # they are imported by a checked file.
 follow_imports = "skip"
 exclude = [
-    "pylibraft/pylibraft/test",
+    "pylibraft/pylibraft/tests",
  ]
 
 [tool.codespell]
@@ -45,6 +45,6 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,numer"
+ignore-words-list = "inout,unparseable,numer"
 builtin = "clear"
 quiet-level = 3
diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt
new file mode 100644
index 0000000000..db81aa9507
--- /dev/null
+++ b/python/libraft/CMakeLists.txt
@@ -0,0 +1,56 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+project(
+  libraft-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX
+)
+
+# Check if raft is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python raft package.
+find_package(raft "${RAPIDS_VERSION}")
+
+if(raft_FOUND)
+  return() # raft already found: skip the source build configured below
+endif()
+
+unset(raft_FOUND) # NOTE(review): presumably resets the failed lookup result; confirm
+
+# --- CUDA --- #
+set(CUDA_STATIC_RUNTIME ON)
+set(CUDA_STATIC_MATH_LIBRARIES OFF)
+
+# --- RAFT --- #
+set(BUILD_TESTS OFF)
+set(BUILD_PRIMS_BENCH OFF)
+set(RAFT_COMPILE_DYNAMIC_ONLY ON)
+set(RAFT_COMPILE_LIBRARY ON)
+
+add_subdirectory(../../cpp raft-cpp) # configure raft from the cpp/ directory two levels up
+
+# assumes libraft.so is installed 2 levels deep, e.g. site-packages/libraft/lib64/libraft.so
+set_property(
+  TARGET raft_lib
+  PROPERTY INSTALL_RPATH
+           "$ORIGIN/../../nvidia/cublas/lib"
+           "$ORIGIN/../../nvidia/curand/lib"
+           "$ORIGIN/../../nvidia/cusolver/lib"
+           "$ORIGIN/../../nvidia/cusparse/lib"
+           "$ORIGIN/../../nvidia/nvjitlink/lib"
+)
diff --git a/python/libraft/LICENSE b/python/libraft/LICENSE
new file mode 120000
index 0000000000..30cff7403d
--- /dev/null
+++ b/python/libraft/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libraft/README.md b/python/libraft/README.md
new file mode 120000
index 0000000000..fe84005413
--- /dev/null
+++ b/python/libraft/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/libraft/libraft/VERSION b/python/libraft/libraft/VERSION
new file mode 120000
index 0000000000..d62dc733ef
--- /dev/null
+++ b/python/libraft/libraft/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libraft/libraft/__init__.py b/python/libraft/libraft/__init__.py
new file mode 100644
index 0000000000..9260f4e67c
--- /dev/null
+++ b/python/libraft/libraft/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libraft._version import __git_commit__, __version__
+from libraft.load import load_library
diff --git a/python/libraft/libraft/_version.py b/python/libraft/libraft/_version.py
new file mode 100644
index 0000000000..530bf8bea6
--- /dev/null
+++ b/python/libraft/libraft/_version.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (  # stripped text of the VERSION file inside this package
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (  # stripped text of GIT_COMMIT, when that file exists
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""  # no GIT_COMMIT file in the package: report empty
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/libraft/libraft/load.py b/python/libraft/libraft/load.py
new file mode 100644
index 0000000000..ad3db9e09c
--- /dev/null
+++ b/python/libraft/libraft/load.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libraft that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() ``soname`` exactly as given (no directory prepended).
+    Raises ``OSError`` if the library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, mode=PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """dlopen() ``soname`` from the ``lib64`` directory next to this module.
+    Returns ``None`` when that file does not exist; if it exists but cannot
+    be loaded, ``OSError`` propagates.
+    """
+    bundled_path = os.path.join(os.path.dirname(__file__), "lib64", soname)
+    if not os.path.isfile(bundled_path):
+        return None
+    return ctypes.CDLL(bundled_path, PREFERRED_LOAD_FLAG)
+
+
+def load_library():
+    """Load ``libraft.so`` via ctypes and return the handle (or ``None``).
+
+    ``RAPIDS_LIBRAFT_PREFER_SYSTEM_LIBRARY`` selects the search order: any
+    value other than "false" (case-insensitive) tries the system copy first.
+    """
+    soname = "libraft.so"
+    env_value = os.getenv("RAPIDS_LIBRAFT_PREFER_SYSTEM_LIBRARY", "false")
+
+    if env_value.lower() != "false":
+        # System copy first, so symbols other packages expect are not
+        # clobbered; the wheel's bundled copy is only the fallback. An
+        # OSError raised by that fallback propagates to the caller.
+        try:
+            return _load_system_installation(soname)
+        except OSError:
+            return _load_wheel_installation(soname)
+
+    # Bundled copy first. It may be absent (which might be the case in
+    # builds where the library was prebuilt before packaging the wheel);
+    # in that case look for a system installation.
+    try:
+        handle = _load_wheel_installation(soname)
+        if handle is None:
+            handle = _load_system_installation(soname)
+    except OSError:
+        # Nothing could be loaded: stay silent and return None, leaving it
+        # to other mechanisms (like RPATHs on other DSOs) to help the
+        # loader find the library.
+        return None
+
+    # The caller almost never needs to do anything with this object, but
+    # there is no harm in returning it: it at least provides a handle to
+    # inspect where libraft was loaded from.
+    return handle
diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml
new file mode 100644
index 0000000000..89b2834614
--- /dev/null
+++ b/python/libraft/pyproject.toml
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+build-backend = "rapids_build_backend.build"
+
+[project]
+name = "libraft"
+dynamic = ["version"]
+description = "RAFT: Reusable Algorithms Functions and other Tools (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+dependencies = [
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+]
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/raft"
+Documentation = "https://docs.rapids.ai/api/raft/stable/"
+
+[project.entry-points."cmake.prefix"]
+libraft = "libraft"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_first_party = [
+    "libraft",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.install-dir = "libraft"
+wheel.packages = ["libraft"]
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libraft/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "librmm==25.2.*,>=0.0.0a0",
+    "ninja",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
diff --git a/python/pylibraft/.coveragerc b/python/pylibraft/.coveragerc
index fc087fb9c5..3269e10b8a 100644
--- a/python/pylibraft/.coveragerc
+++ b/python/pylibraft/.coveragerc
@@ -1,3 +1,3 @@
 # Configuration file for Python coverage tests
 [run]
-source = pylibraft
\ No newline at end of file
+source = pylibraft
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index 758c1e4711..83c262dc10 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -27,68 +27,13 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       ON
-)
-option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
-
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
-  if(NOT TARGET raft::raft_lib)
-    message(
-      FATAL_ERROR
-        "Building against a preexisting libraft library requires the compiled libraft to have been built!"
-    )
-
-  endif()
-else()
-  set(raft_FOUND OFF)
-endif()
+# an installed version of raft contains the other necessary targets (like CCCL and cuco)
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS raft compiled)
 
 include(rapids-cython-core)
 
-if(NOT raft_FOUND)
-  find_package(CUDAToolkit REQUIRED)
-
-  set(BUILD_TESTS OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARY ON)
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
-    set(CUDA_STATIC_MATH_LIBRARIES OFF)
-  elseif(USE_CUDA_MATH_WHEELS)
-    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
-  endif()
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-
-  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
-    set_property(
-      TARGET raft_lib
-      PROPERTY INSTALL_RPATH
-               "$ORIGIN/../nvidia/cublas/lib"
-               "$ORIGIN/../nvidia/curand/lib"
-               "$ORIGIN/../nvidia/cusolver/lib"
-               "$ORIGIN/../nvidia/cusparse/lib"
-               "$ORIGIN/../nvidia/nvjitlink/lib"
-    )
-  endif()
-
-  # When building the C++ libraries from source we must copy libraft.so alongside the
-  # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft
-  # library, we shouldn't need this
-  set(cython_lib_dir pylibraft)
-  install(TARGETS raft_lib DESTINATION ${cython_lib_dir})
-endif()
-
 rapids_cython_init()
 
 add_subdirectory(pylibraft/common)
 add_subdirectory(pylibraft/random)
 add_subdirectory(pylibraft/sparse)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET raft PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index b0869501f3..a01e02ec33 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -13,4 +13,15 @@
 # limitations under the License.
 #
 
+# If libraft was installed as a wheel, ask it to load the library symbols.
+# Otherwise (the import fails), assume the library was installed in a system
+# path that ld can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft  # drop the helper module name from pylibraft's namespace
+
 from pylibraft._version import __git_commit__, __version__
diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt
index 53279bfaf7..d1c1acb3aa 100644
--- a/python/pylibraft/pylibraft/common/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/common/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
 )
diff --git a/python/pylibraft/pylibraft/common/cuda.pxd b/python/pylibraft/pylibraft/common/cuda.pxd
index a44d9aeb63..934573b51f 100644
--- a/python/pylibraft/pylibraft/common/cuda.pxd
+++ b/python/pylibraft/pylibraft/common/cuda.pxd
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 
 
 cdef class Stream:
diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx
index c164a463ae..cda0fc7168 100644
--- a/python/pylibraft/pylibraft/common/cuda.pyx
+++ b/python/pylibraft/pylibraft/common/cuda.pyx
@@ -19,7 +19,7 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
-from cuda.ccudart cimport (
+from cuda.bindings.cyruntime cimport (
     cudaError_t,
     cudaGetErrorName,
     cudaGetErrorString,
diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx
index d256e671bf..400b667789 100644
--- a/python/pylibraft/pylibraft/common/handle.pyx
+++ b/python/pylibraft/pylibraft/common/handle.pyx
@@ -21,7 +21,7 @@
 
 import functools
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from libc.stdint cimport uintptr_t
 
 from rmm.librmm.cuda_stream_view cimport (
diff --git a/python/pylibraft/pylibraft/common/interruptible.pyx b/python/pylibraft/pylibraft/common/interruptible.pyx
index c489f2ee20..ceac387f58 100644
--- a/python/pylibraft/pylibraft/common/interruptible.pyx
+++ b/python/pylibraft/pylibraft/common/interruptible.pyx
@@ -22,7 +22,7 @@
 import contextlib
 import signal
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from cython.operator cimport dereference
 
 from rmm.librmm.cuda_stream_view cimport cuda_stream_view
diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt
index 10ff776471..7d61855111 100644
--- a/python/pylibraft/pylibraft/random/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/random/CMakeLists.txt
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX random_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX random_
 )
diff --git a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
index ef16981644..7b2c9f6162 100644
--- a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX sparse_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX sparse_
 )
diff --git a/python/pylibraft/pylibraft/test/__init__py b/python/pylibraft/pylibraft/tests/__init__py
similarity index 100%
rename from python/pylibraft/pylibraft/test/__init__py
rename to python/pylibraft/pylibraft/tests/__init__py
diff --git a/python/pylibraft/pylibraft/test/pytest.ini b/python/pylibraft/pylibraft/tests/pytest.ini
similarity index 98%
rename from python/pylibraft/pylibraft/test/pytest.ini
rename to python/pylibraft/pylibraft/tests/pytest.ini
index bf70c06f84..7b0a9f29fb 100644
--- a/python/pylibraft/pylibraft/test/pytest.ini
+++ b/python/pylibraft/pylibraft/tests/pytest.ini
@@ -2,4 +2,3 @@
 
 [pytest]
 addopts = --tb=native
-
diff --git a/python/pylibraft/pylibraft/test/test_cai_wrapper.py b/python/pylibraft/pylibraft/tests/test_cai_wrapper.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_cai_wrapper.py
rename to python/pylibraft/pylibraft/tests/test_cai_wrapper.py
diff --git a/python/pylibraft/pylibraft/test/test_config.py b/python/pylibraft/pylibraft/tests/test_config.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_config.py
rename to python/pylibraft/pylibraft/tests/test_config.py
diff --git a/python/pylibraft/pylibraft/test/test_device_ndarray.py b/python/pylibraft/pylibraft/tests/test_device_ndarray.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_device_ndarray.py
rename to python/pylibraft/pylibraft/tests/test_device_ndarray.py
diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/tests/test_doctests.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_doctests.py
rename to python/pylibraft/pylibraft/tests/test_doctests.py
diff --git a/python/pylibraft/pylibraft/test/test_handle.py b/python/pylibraft/pylibraft/tests/test_handle.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_handle.py
rename to python/pylibraft/pylibraft/tests/test_handle.py
diff --git a/python/pylibraft/pylibraft/test/test_mdspan_serializer.py b/python/pylibraft/pylibraft/tests/test_mdspan_serializer.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_mdspan_serializer.py
rename to python/pylibraft/pylibraft/tests/test_mdspan_serializer.py
diff --git a/python/pylibraft/pylibraft/test/test_random.py b/python/pylibraft/pylibraft/tests/test_random.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_random.py
rename to python/pylibraft/pylibraft/tests/test_random.py
diff --git a/python/pylibraft/pylibraft/test/test_sparse.py b/python/pylibraft/pylibraft/tests/test_sparse.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_sparse.py
rename to python/pylibraft/pylibraft/tests/test_sparse.py
diff --git a/python/pylibraft/pylibraft/test/test_version.py b/python/pylibraft/pylibraft/tests/test_version.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_version.py
rename to python/pylibraft/pylibraft/tests/test_version.py
diff --git a/python/pylibraft/pylibraft/test/test_z_interruptible.py b/python/pylibraft/pylibraft/tests/test_z_interruptible.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_z_interruptible.py
rename to python/pylibraft/pylibraft/tests/test_z_interruptible.py
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index bb01602b33..912f1ad947 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -32,12 +32,9 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "cuda-python",
+    "libraft==25.2.*,>=0.0.0a0",
     "numpy>=1.23,<3.0a0",
-    "nvidia-cublas",
-    "nvidia-curand",
-    "nvidia-cusolver",
-    "nvidia-cusparse",
-    "rmm==24.12.*,>=0.0.0a0",
+    "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -124,19 +121,21 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cuda-python",
     "cython>=3.0.0,<3.1.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "ninja",
-    "rmm==24.12.*,>=0.0.0a0",
+    "rmm==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.pydistcheck]
 select = [
     "distro-too-large-compressed",
 ]
 
-# detect when package size grows significantly
-max_allowed_size_compressed = '825M'
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
 
 [tool.pytest.ini_options]
 filterwarnings = [
diff --git a/python/pylibraft/setup.cfg b/python/pylibraft/setup.cfg
deleted file mode 100644
index 7d1a0c9065..0000000000
--- a/python/pylibraft/setup.cfg
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-[isort]
-line_length=79
-multi_line_output=3
-include_trailing_comma=True
-force_grid_wrap=0
-combine_as_imports=True
-order_by_type=True
-known_dask=
-    dask
-    distributed
-    dask_cuda
-known_rapids=
-    nvtext
-    cudf
-    cuml
-    cugraph
-    dask_cudf
-    rmm
-known_first_party=
-    raft
-    pylibraft
-default_section=THIRDPARTY
-sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER
-skip=
-    thirdparty
-    .eggs
-    .git
-    .hg
-    .mypy_cache
-    .tox
-    .venv
-    _build
-    buck-out
-    build
-    dist
-    __init__.py
diff --git a/python/raft-dask/.coveragerc b/python/raft-dask/.coveragerc
index 968c4b898a..8077c9ae90 100644
--- a/python/raft-dask/.coveragerc
+++ b/python/raft-dask/.coveragerc
@@ -1,3 +1,3 @@
 # Configuration file for Python coverage tests
 [run]
-source = raft_dask
\ No newline at end of file
+source = raft_dask
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 9ebbaa5298..1fcb40a58d 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -25,38 +25,16 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       OFF
-)
-
 rapids_cpm_init()
 # Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can remove this.
 find_package(ucx REQUIRED)
 include(cmake/thirdparty/get_ucxx.cmake)
 
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
-else()
-  set(raft_FOUND OFF)
-endif()
-
-if(NOT raft_FOUND)
-  # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
-  # library compilation and we don't need to install anything here.
-  set(BUILD_TESTS OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARIES OFF)
-  set(RAFT_COMPILE_DIST_LIBRARY OFF)
-  set(RAFT_COMPILE_NN_LIBRARY OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  set(RAFT_DASK_UCXX_STATIC ON)
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-  list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
-  find_package(NCCL REQUIRED)
-endif()
+# why these components:
+#
+# * 'raft' = the headers, needed to link against libraft
+# * 'distributed' = needed for NCCL
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS raft distributed)
 
 include(rapids-cython-core)
 rapids_cython_init()
diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
index db9b5c6b4d..e6b9c4aa0e 100644
--- a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
@@ -45,11 +45,11 @@ function(find_and_configure_ucxx)
 endfunction()
 
 # Change pinned tag here to test a commit in CI
-# To use a different RAFT locally, set the CMake variable
-# CPM_raft_SOURCE=/path/to/local/raft
-find_and_configure_ucxx(VERSION  0.41
+# To use a different ucxx locally, set the CMake variable
+# CPM_ucxx_SOURCE=/path/to/local/ucxx
+find_and_configure_ucxx(VERSION  0.42
         FORK             rapidsai
-        PINNED_TAG       branch-0.41
+        PINNED_TAG       branch-0.42
         EXCLUDE_FROM_ALL YES
         UCXX_STATIC      ${RAFT_DASK_UCXX_STATIC}
     )
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index a9f4de5dc3..d3a26db282 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -31,13 +31,12 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "dask-cuda==24.12.*,>=0.0.0a0",
-    "distributed-ucxx==0.41.*,>=0.0.0a0",
-    "joblib>=0.11",
-    "numba>=0.57",
-    "pylibraft==24.12.*,>=0.0.0a0",
-    "rapids-dask-dependency==24.12.*,>=0.0.0a0",
-    "ucx-py==0.41.*,>=0.0.0a0",
+    "dask-cuda==25.2.*,>=0.0.0a0",
+    "distributed-ucxx==0.42.*,>=0.0.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "pylibraft==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "ucx-py==0.42.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -121,6 +120,8 @@ build-backend = "scikit_build_core.build"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.0,<3.1.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "libucx==1.15.0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 19a037ae75..78248fad7a 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
 
-from raft_dask._version import __git_commit__, __version__
-
 # If libucx was installed as a wheel, we must request it to load the library symbols.
 # Otherwise, we assume that the library was installed in a system path that ld can find.
 try:
@@ -24,3 +22,16 @@
 else:
     libucx.load_library()
     del libucx
+
+# If libraft was installed as a wheel, we must request it to load the library
+# symbols. Otherwise, we assume that the library was installed in a system path that ld
+# can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft
+
+from raft_dask._version import __git_commit__, __version__
diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt
index 65d5f06577..1279d5d501 100644
--- a/python/raft-dask/raft_dask/common/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/common/CMakeLists.txt
@@ -15,6 +15,5 @@
 set(cython_sources comms_utils.pyx nccl.pyx)
 set(linked_libraries raft::raft raft::distributed)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
-                                                                            CXX
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX
 )
diff --git a/python/raft-dask/raft_dask/include_test/CMakeLists.txt b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
index 2ff1cd9150..8839c57b91 100644
--- a/python/raft-dask/raft_dask/include_test/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
@@ -15,6 +15,5 @@
 set(cython_sources raft_include_test.pyx)
 set(linked_libraries raft::raft)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
-                                                                            CXX
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX
 )
diff --git a/python/raft-dask/raft_dask/test/conftest.py b/python/raft-dask/raft_dask/tests/conftest.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/conftest.py
rename to python/raft-dask/raft_dask/tests/conftest.py
diff --git a/python/raft-dask/raft_dask/test/pytest.ini b/python/raft-dask/raft_dask/tests/pytest.ini
similarity index 98%
rename from python/raft-dask/raft_dask/test/pytest.ini
rename to python/raft-dask/raft_dask/tests/pytest.ini
index bf70c06f84..7b0a9f29fb 100644
--- a/python/raft-dask/raft_dask/test/pytest.ini
+++ b/python/raft-dask/raft_dask/tests/pytest.ini
@@ -2,4 +2,3 @@
 
 [pytest]
 addopts = --tb=native
-
diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_comms.py
rename to python/raft-dask/raft_dask/tests/test_comms.py
diff --git a/python/raft-dask/raft_dask/test/test_raft.py b/python/raft-dask/raft_dask/tests/test_raft.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_raft.py
rename to python/raft-dask/raft_dask/tests/test_raft.py
diff --git a/python/raft-dask/raft_dask/test/test_version.py b/python/raft-dask/raft_dask/tests/test_version.py
similarity index 100%
rename from python/raft-dask/raft_dask/test/test_version.py
rename to python/raft-dask/raft_dask/tests/test_version.py
diff --git a/rapids_config.cmake b/rapids_config.cmake
index c8077f7f4b..a40d7130c0 100644
--- a/rapids_config.cmake
+++ b/rapids_config.cmake
@@ -22,13 +22,15 @@ else()
   string(REPLACE "\n" "\n  " _rapids_version_formatted "  ${_rapids_version}")
   message(
     FATAL_ERROR
-      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}")
+      "Could not determine RAPIDS version. Contents of VERSION file:\n${_rapids_version_formatted}"
+  )
 endif()
 
 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
   file(
     DOWNLOAD
     "https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/RAPIDS.cmake"
-    "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
+    "${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake"
+  )
 endif()
 include("${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 94140d4d00..0000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-[flake8]
-filename = *.py, *.pyx, *.pxd, *.pxi
-exclude = __init__.py, *.egg, build, docs, .git
-force-check = True
-ignore =
-    # line break before binary operator
-    W503,
-    # whitespace before :
-    E203
-per-file-ignores =
-    # Rules ignored only in Cython:
-    # E211: whitespace before '(' (used in multi-line imports)
-    # E225: Missing whitespace around operators (breaks cython casting syntax like <int>)
-    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
-    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
-    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
-    # E402: invalid syntax (works for Python, not Cython)
-    # E999: invalid syntax (works for Python, not Cython)
-    # W504: line break after binary operator (breaks lines that end with a pointer)
-    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
-
-[pydocstyle]
-# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
-# than include using match-dir. Note that as discussed in
-# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle,
-# unlike the match option above this match-dir will have no effect when
-# pydocstyle is invoked from pre-commit. Therefore this exclusion list must
-# also be maintained in the pre-commit config file.
-match-dir = ^(?!(ci|cpp|conda|docs)).*$
-# Allow missing docstrings for docutils
-ignore-decorators = .*(docutils|doc_apply|copy_docstring).*
-select =
-    D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418
-    # Would like to enable the following rules in the future:
-    # D200, D202, D205, D400
-
-[mypy]
-ignore_missing_imports = True
-# If we don't specify this, then mypy will check excluded files if
-# they are imported by a checked file.
-follow_imports = skip
-
-[codespell]
-# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
-# this is only to allow you to run codespell interactively
-skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild
-# ignore short words, and typename parameters like OffsetT
-ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b
-ignore-words-list = inout,unparseable,numer
-builtin = clear
-quiet-level = 3
diff --git a/thirdparty/LICENSES/LICENSE.ann-benchmark b/thirdparty/LICENSES/LICENSE.ann-benchmark
index 9f8e4222f6..4d04745ab4 100644
--- a/thirdparty/LICENSES/LICENSE.ann-benchmark
+++ b/thirdparty/LICENSES/LICENSE.ann-benchmark
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/thirdparty/LICENSES/LICENSE.faiss b/thirdparty/LICENSES/LICENSE.faiss
index 87cbf536c6..b96dcb0480 100644
--- a/thirdparty/LICENSES/LICENSE.faiss
+++ b/thirdparty/LICENSES/LICENSE.faiss
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/thirdparty/LICENSES/LICENSE.pytorch b/thirdparty/LICENSES/LICENSE.pytorch
index 7ad3d737a5..04f9ad1105 100644
--- a/thirdparty/LICENSES/LICENSE.pytorch
+++ b/thirdparty/LICENSES/LICENSE.pytorch
@@ -74,4 +74,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdparty/LICENSES/mdarray.license b/thirdparty/LICENSES/mdarray.license
index e636b86032..5a491b0879 100644
--- a/thirdparty/LICENSES/mdarray.license
+++ b/thirdparty/LICENSES/mdarray.license
@@ -39,4 +39,4 @@
 //
 // ************************************************************************
 //@HEADER
-*/
\ No newline at end of file
+*/