diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 9d35e3f97f..594ba8c3c4 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -5,6 +5,13 @@ ARG PYTHON_PACKAGE_MANAGER=conda
 
 FROM ${BASE} as pip-base
 
+RUN apt update -y \
+ && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \
+    # faiss dependencies
+    libblas-dev \
+    liblapack-dev \
+ && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
+
 ENV DEFAULT_VIRTUAL_ENV=rapids
 
 FROM ${BASE} as conda-base
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 2682510ed1..536537f07f 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index de039eeb11..92e7613a9b 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,22 +5,27 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {
-      "version": "1.14.1"
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
+      "version": "1.15.0"
     },
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 4b24d94dd1..948680eaf6 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -5,12 +5,17 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 489546cb21..cd287569d8 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -5,22 +5,27 @@ "args": { "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": { - "version": "1.14.1" + "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": { + "version": "1.15.0" }, - "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": { "version": "12.2", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fc4fcd458b..d1cc52592c 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,11 +11,14 @@ python/setup.py @rapidsai/raft-cmake-codeowners build.sh @rapidsai/raft-cmake-codeowners **/build.sh @rapidsai/raft-cmake-codeowners -#build/ops code owners -.github/ @rapidsai/ops-codeowners -ci/ @rapidsai/ops-codeowners -conda/ @rapidsai/ops-codeowners -**/Dockerfile @rapidsai/ops-codeowners -**/.dockerignore @rapidsai/ops-codeowners -docker/ @rapidsai/ops-codeowners -dependencies.yaml @rapidsai/ops-codeowners +#CI code owners +/.github/ @rapidsai/ci-codeowners +/ci/ @rapidsai/ci-codeowners +/.pre-commit-config.yaml @rapidsai/ci-codeowners + +#packaging code owners +/.devcontainers/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +/dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bd8b13d21e..e013d4f1c5 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: 
inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -87,9 +87,8 @@ jobs: date: ${{ inputs.date }} package-name: pylibraft wheel-build-raft-dask: - needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +98,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ada46141a7..c2d9556859 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@fix/devcontainer-json-location + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06 with: arch: '["amd64"]' cuda: '["12.2"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2a557a8b84..18094cc05a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: _ZN\d+raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a4da6197e..e0599dae8a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,82 @@
+# raft 24.06.00 (5 Jun 2024)
+
+## 🚨 Breaking Changes
+
+- Rename raft-ann-bench module to raft_ann_bench ([#2333](https://github.com/rapidsai/raft/pull/2333)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Scaling workspace resources ([#2322](https://github.com/rapidsai/raft/pull/2322)) [@achirkin](https://github.com/achirkin)
+- [REVIEW] Adjust UCX dependencies ([#2304](https://github.com/rapidsai/raft/pull/2304)) [@pentschev](https://github.com/pentschev)
+- Convert device_memory_resource* to device_async_resource_ref ([#2269](https://github.com/rapidsai/raft/pull/2269)) [@harrism](https://github.com/harrism)
+
+## 🐛 Bug Fixes
+
+- Fix import of VERSION file in raft-ann-bench ([#2338](https://github.com/rapidsai/raft/pull/2338)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Rename raft-ann-bench module to raft_ann_bench ([#2333](https://github.com/rapidsai/raft/pull/2333)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Support building faiss main statically ([#2323](https://github.com/rapidsai/raft/pull/2323)) [@robertmaynard](https://github.com/robertmaynard)
+- Refactor spectral scale_obs to use existing normalization function ([#2319](https://github.com/rapidsai/raft/pull/2319)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Correct initializer list order found by cuvs ([#2317](https://github.com/rapidsai/raft/pull/2317)) [@robertmaynard](https://github.com/robertmaynard)
+- ANN_BENCH: enable move semantics for configured_raft_resources ([#2311](https://github.com/rapidsai/raft/pull/2311)) [@achirkin](https://github.com/achirkin)
+- Revert "Build C++ wheel ([#2264](https://github.com/rapidsai/raft/pull/2264))" ([#2305](https://github.com/rapidsai/raft/pull/2305)) [@vyasr](https://github.com/vyasr)
+- Revert "Add `compile-library` by default on pylibraft build" ([#2300](https://github.com/rapidsai/raft/pull/2300)) [@vyasr](https://github.com/vyasr)
+- Add VERSION to raft-ann-bench package ([#2299](https://github.com/rapidsai/raft/pull/2299)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove nonexistent job from workflow ([#2298](https://github.com/rapidsai/raft/pull/2298)) [@vyasr](https://github.com/vyasr)
+- `libucx` should be run dependency of `raft-dask` ([#2296](https://github.com/rapidsai/raft/pull/2296)) [@divyegala](https://github.com/divyegala)
+- Fix clang intrinsic warning ([#2292](https://github.com/rapidsai/raft/pull/2292)) [@aaronmondal](https://github.com/aaronmondal)
+- Replace too long index file name with hash in ANN bench ([#2280](https://github.com/rapidsai/raft/pull/2280)) [@tfeher](https://github.com/tfeher)
+- Fix build command for C++ compilation ([#2270](https://github.com/rapidsai/raft/pull/2270)) [@lowener](https://github.com/lowener)
+- Fix a compilation error in CAGRA when enabling log output ([#2262](https://github.com/rapidsai/raft/pull/2262)) [@enp1s0](https://github.com/enp1s0)
+- Correct member initialization order ([#2254](https://github.com/rapidsai/raft/pull/2254)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix time computation in CAGRA notebook ([#2231](https://github.com/rapidsai/raft/pull/2231)) [@lowener](https://github.com/lowener)
+
+## 📖 Documentation
+
+- Fix citation info ([#2318](https://github.com/rapidsai/raft/pull/2318)) [@enp1s0](https://github.com/enp1s0)
+
+## 🚀 New Features
+
+- Scaling workspace resources ([#2322](https://github.com/rapidsai/raft/pull/2322)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: AnnGPU::uses_stream() for optional algo GPU sync ([#2314](https://github.com/rapidsai/raft/pull/2314)) [@achirkin](https://github.com/achirkin)
+- [FEA] Split Bitset code ([#2295](https://github.com/rapidsai/raft/pull/2295)) [@lowener](https://github.com/lowener)
+- [FEA] support of prefiltered brute force ([#2294](https://github.com/rapidsai/raft/pull/2294)) [@rhdong](https://github.com/rhdong)
+- Always use a static gtest and gbench ([#2265](https://github.com/rapidsai/raft/pull/2265)) [@robertmaynard](https://github.com/robertmaynard)
+- Build C++ wheel ([#2264](https://github.com/rapidsai/raft/pull/2264)) [@vyasr](https://github.com/vyasr)
+- InnerProduct Distance Metric for CAGRA search ([#2260](https://github.com/rapidsai/raft/pull/2260)) [@tarang-jain](https://github.com/tarang-jain)
+- [FEA] Add support for `select_k` on CSR matrix ([#2140](https://github.com/rapidsai/raft/pull/2140)) [@rhdong](https://github.com/rhdong)
+
+## 🛠️ Improvements
+
+- ANN_BENCH: common AnnBase::index_type ([#2315](https://github.com/rapidsai/raft/pull/2315)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: split instances of RaftCagra into multiple files ([#2313](https://github.com/rapidsai/raft/pull/2313)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: a global pool of result buffers across benchmark cases ([#2312](https://github.com/rapidsai/raft/pull/2312)) [@achirkin](https://github.com/achirkin)
+- Remove the shared state and the mutex from NVTX internals ([#2310](https://github.com/rapidsai/raft/pull/2310)) [@achirkin](https://github.com/achirkin)
+- docs: update README.md ([#2308](https://github.com/rapidsai/raft/pull/2308)) [@eltociear](https://github.com/eltociear)
+- [REVIEW] Reenable raft-dask wheel tests requiring UCX-Py ([#2307](https://github.com/rapidsai/raft/pull/2307)) [@pentschev](https://github.com/pentschev)
+- [REVIEW] Adjust UCX dependencies ([#2304](https://github.com/rapidsai/raft/pull/2304)) [@pentschev](https://github.com/pentschev)
+- Overhaul ops-codeowners ([#2303](https://github.com/rapidsai/raft/pull/2303)) [@raydouglass](https://github.com/raydouglass)
+- Make thrust nosync execution policy the default thrust policy ([#2302](https://github.com/rapidsai/raft/pull/2302)) [@abc99lr](https://github.com/abc99lr)
+- InnerProduct testing for CAGRA+HNSW ([#2297](https://github.com/rapidsai/raft/pull/2297)) [@divyegala](https://github.com/divyegala)
+- Enable warnings as errors for Python tests ([#2288](https://github.com/rapidsai/raft/pull/2288)) [@mroeschke](https://github.com/mroeschke)
+- Normalize dataset vectors in the CAGRA InnerProduct tests ([#2287](https://github.com/rapidsai/raft/pull/2287)) [@enp1s0](https://github.com/enp1s0)
+- Use dynamic version for raft-ann-bench ([#2285](https://github.com/rapidsai/raft/pull/2285)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Make 'librmm' a 'host' dependency for conda packages ([#2284](https://github.com/rapidsai/raft/pull/2284)) [@jameslamb](https://github.com/jameslamb)
+- Fix comments in cpp/include/raft/neighbors/cagra_serialize.cuh ([#2283](https://github.com/rapidsai/raft/pull/2283)) [@jiangyinzuo](https://github.com/jiangyinzuo)
+- Only use functions in the limited API ([#2282](https://github.com/rapidsai/raft/pull/2282)) [@vyasr](https://github.com/vyasr)
+- define 'ucx' pytest marker ([#2281](https://github.com/rapidsai/raft/pull/2281)) [@jameslamb](https://github.com/jameslamb)
+- Migrate to `{{ stdlib("c") }}` ([#2278](https://github.com/rapidsai/raft/pull/2278)) [@hcho3](https://github.com/hcho3)
+- add --rm and --name to devcontainer run args ([#2275](https://github.com/rapidsai/raft/pull/2275)) [@trxcllnt](https://github.com/trxcllnt)
+- Update pip devcontainers to UCX v1.15.0 ([#2274](https://github.com/rapidsai/raft/pull/2274)) [@trxcllnt](https://github.com/trxcllnt)
+- `#ifdef` out pragma deprecation warning messages ([#2271](https://github.com/rapidsai/raft/pull/2271)) [@trxcllnt](https://github.com/trxcllnt)
+- Convert device_memory_resource* to device_async_resource_ref ([#2269](https://github.com/rapidsai/raft/pull/2269)) [@harrism](https://github.com/harrism)
+- Update the developer's guide with new copyright hook ([#2266](https://github.com/rapidsai/raft/pull/2266)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Improve coalesced reduction performance for tall and thin matrices (up to 2.6x faster) ([#2259](https://github.com/rapidsai/raft/pull/2259)) [@Nyrio](https://github.com/Nyrio)
+- Adds missing files to `update-version.sh` ([#2255](https://github.com/rapidsai/raft/pull/2255)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Enable all tests for `arm64` jobs ([#2248](https://github.com/rapidsai/raft/pull/2248)) [@galipremsagar](https://github.com/galipremsagar)
+- Update nvtx3 link in cmake ([#2246](https://github.com/rapidsai/raft/pull/2246)) [@lowener](https://github.com/lowener)
+- Add CAGRA-Q subspace dim = 4 support ([#2244](https://github.com/rapidsai/raft/pull/2244)) [@enp1s0](https://github.com/enp1s0)
+- Get rid of `cuco::sentinel` namespace ([#2243](https://github.com/rapidsai/raft/pull/2243)) [@PointKernel](https://github.com/PointKernel)
+- Replace usages of raw `get_upstream` with `get_upstream_resource()` ([#2207](https://github.com/rapidsai/raft/pull/2207)) [@miscco](https://github.com/miscco)
+- Set the import mode for dask tests ([#2142](https://github.com/rapidsai/raft/pull/2142)) [@vyasr](https://github.com/vyasr)
+- Add UCXX support ([#1983](https://github.com/rapidsai/raft/pull/1983)) [@pentschev](https://github.com/pentschev)
+
 # raft 24.04.00 (10 Apr 2024)
 
 ## 🐛 Bug Fixes
diff --git a/README.md b/README.md
index 7833a5cfa3..fc56859557 100755
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
 - [RAFT Reference Documentation](https://docs.rapids.ai/api/raft/stable/): API Documentation.
 - [RAFT Getting Started](./docs/source/quick_start.md): Getting started with RAFT.
 - [Build and Install RAFT](./docs/source/build.md): Instructions for installing and building RAFT.
-- [Example Notebooks](./notebooks): Example jupyer notebooks
+- [Example Notebooks](./notebooks): Example jupyter notebooks
 - [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate.
 - [GitHub repository](https://github.com/rapidsai/raft): Download the RAFT source code.
 - [Issue tracker](https://github.com/rapidsai/raft/issues): Report issues or request features.
@@ -293,7 +293,7 @@ You can also install the conda packages individually using the `mamba` command a
 mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0
 ```
 
-If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
 
 ### Installing Python through Pip
@@ -354,10 +354,8 @@ If citing CAGRA, please consider the following bibtex:
 @misc{ootomo2023cagra,
     title={CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search for GPUs},
     author={Hiroyuki Ootomo and Akira Naruse and Corey Nolet and Ray Wang and Tamas Feher and Yong Wang},
-    year={2023},
-    eprint={2308.15136},
-    archivePrefix={arXiv},
-    primaryClass={cs.DS}
+    year={2024},
+    series = {ICDE '24}
 }
 ```
@@ -365,13 +363,14 @@ If citing the k-selection routines, please consider the following bibtex:
 ```bibtex
 @proceedings{10.1145/3581784,
-    title = {SC '23: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
+    title = {Parallel Top-K Algorithms on GPU: A Comprehensive Study and New Methods},
+    author = {Jingrong Zhang and Akira Naruse and Xipeng Li and Yong Wang},
     year = {2023},
     isbn = {9798400701092},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
-    abstract = {Started in 1988, the SC Conference has become the annual nexus for researchers and practitioners from academia, industry and government to share information and foster collaborations to advance the state of the art in High Performance Computing (HPC), Networking, Storage, and Analysis.},
-    location = {, Denver, CO, USA, }
+    location = {Denver, CO, USA},
+    series = {SC '23}
 }
 ```
@@ -394,4 +393,4 @@ If citing the nearest neighbors descent API, please consider the following bibte
     location = {Virtual Event, Queensland, Australia},
     series = {CIKM '21}
 }
-```
\ No newline at end of file
+```
diff --git a/VERSION b/VERSION
index 4a2fe8aa57..0bff6981a3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.04.00
+24.06.00
diff --git a/build.sh b/build.sh
index 45c7d1380f..148d23c9c1 100755
--- a/build.sh
+++ b/build.sh
@@ -305,7 +305,7 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 
-if hasArg --compile-lib || hasArg pylibraft || (( ${NUMARGS} == 0 )); then
+if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then
     COMPILE_LIBRARY=ON
     CMAKE_TARGET="${CMAKE_TARGET};raft_lib"
 fi
@@ -405,7 +405,7 @@ fi
 ################################################################################
 # Configure for building all C++ targets
 
-if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || ((${COMPILE_LIBRARY} == ON )); then
+if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 5d06e46303..e3e7ce9c89 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -7,6 +7,10 @@ package_name=$1 package_dir=$2 underscore_package_name=$(echo "${package_name}" | tr "-" "_") +# Clear out system ucx files to ensure that we're getting ucx from the wheel. +rm -rf /usr/lib64/ucx +rm -rf /usr/lib64/libuc* + source rapids-configure-sccache source rapids-date-string @@ -38,9 +42,11 @@ fi if [[ ${package_name} == "raft-dask" ]]; then sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} fi @@ -56,6 +62,6 @@ cd "${package_dir}" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check mkdir -p final_dist -python -m auditwheel repair -w final_dist dist/* +python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/* RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh index ec30a28b92..895c311f46 100755 --- a/ci/build_wheel_pylibraft.sh +++ b/ci/build_wheel_pylibraft.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index 5ae12303d0..feba2d7a5b 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
set -euo pipefail diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 636f637d0c..9554a7dde8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -37,6 +37,8 @@ function sed_runner() { } sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake +sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION ${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake +sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake # Centralized version file update echo "${NEXT_FULL_TAG}" > VERSION @@ -50,7 +52,7 @@ DEPENDENCIES=( rmm-cu11 rmm-cu12 rapids-dask-dependency - # ucx-py is handled separately below + # ucx-py and ucxx are handled separately below ) for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do @@ -59,6 +61,10 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; + sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE}; done for FILE in python/*/pyproject.toml; do for DEP in "${DEPENDENCIES[@]}"; do @@ -68,6 +74,7 @@ for FILE in python/*/pyproject.toml; do done sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml +sed_runner "/^ucxx_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" @@ -85,5 +92,7 @@ sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" README.md find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done diff --git a/ci/run_raft_dask_pytests.sh b/ci/run_raft_dask_pytests.sh index 46cd211d2e..07d0b5baa0 100755 --- a/ci/run_raft_dask_pytests.sh +++ b/ci/run_raft_dask_pytests.sh @@ -6,4 +6,4 @@ set -euo pipefail # Support invoking run_raft_dask_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/raft-dask/raft_dask -pytest --cache-clear "$@" test +pytest --cache-clear --import-mode=append 
"$@" test diff --git a/ci/test_python.sh b/ci/test_python.sh index f5b188ca0b..59da1f0bc4 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -59,5 +59,23 @@ rapids-logger "pytest raft-dask" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-coverage.xml" \ --cov-report=term +rapids-logger "pytest raft-dask (ucx-py only)" +./ci/run_raft_dask_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucx.xml" \ + --cov-config=../.coveragerc \ + --cov=raft_dask \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucx-coverage.xml" \ + --cov-report=term \ + --run_ucx + +rapids-logger "pytest raft-dask (ucxx only)" +./ci/run_raft_dask_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucxx.xml" \ + --cov-config=../.coveragerc \ + --cov=raft_dask \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucxx-coverage.xml" \ + --cov-report=term \ + --run_ucxx + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh index d990a0e6c2..b38f5a690b 100755 --- a/ci/test_wheel_pylibraft.sh +++ b/ci/test_wheel_pylibraft.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -10,9 +10,4 @@ RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/pylibraft*.whl)[test] -# Run smoke tests for aarch64 pull requests -if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then - python ./ci/wheel_smoke_test_pylibraft.py -else - python -m pytest ./python/pylibraft/pylibraft/test -fi +python -m pytest ./python/pylibraft/pylibraft/test diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index b70563b7a1..bd531e7e85 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -11,12 +11,15 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/raft_dask*.whl)[test] +python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/ -# Run smoke tests for aarch64 pull requests -if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then - python ./ci/wheel_smoke_test_raft_dask.py -else - python -m pytest ./python/raft-dask/raft_dask/test -fi +test_dir="python/raft-dask/raft_dask/test" + +rapids-logger "pytest raft-dask" +python -m pytest --import-mode=append ${test_dir} + +rapids-logger "pytest raft-dask (ucx-py only)" +python -m pytest --import-mode=append ${test_dir} --run_ucx + +rapids-logger "pytest raft-dask (ucxx only)" +python -m pytest --import-mode=append ${test_dir} --run_ucxx diff --git a/ci/wheel_smoke_test_pylibraft.py b/ci/wheel_smoke_test_pylibraft.py deleted file mode 100644 index c0df2fe45c..0000000000 --- a/ci/wheel_smoke_test_pylibraft.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np -from scipy.spatial.distance import cdist - -from pylibraft.common import Handle, Stream, device_ndarray -from pylibraft.distance import pairwise_distance - - -if __name__ == "__main__": - metric = "euclidean" - n_rows = 1337 - n_cols = 1337 - - input1 = np.random.random_sample((n_rows, n_cols)) - input1 = np.asarray(input1, order="C").astype(np.float64) - - output = np.zeros((n_rows, n_rows), dtype=np.float64) - - expected = cdist(input1, input1, metric) - - expected[expected <= 1e-5] = 0.0 - - input1_device = device_ndarray(input1) - output_device = None - - s2 = Stream() - handle = Handle(stream=s2) - ret_output = pairwise_distance( - input1_device, input1_device, output_device, metric, handle=handle - ) - handle.sync() - - output_device = ret_output - - actual = output_device.copy_to_host() - - actual[actual <= 1e-5] = 0.0 - - assert np.allclose(expected, actual, rtol=1e-4) diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py deleted file mode 100644 index 5709ac901c..0000000000 --- a/ci/wheel_smoke_test_raft_dask.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from dask.distributed import Client, get_worker, wait -from dask_cuda import LocalCUDACluster, initialize - -from raft_dask.common import ( - Comms, - local_handle, - perform_test_comm_split, - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_device_multicast_sendrecv, - perform_test_comms_device_send_or_recv, - perform_test_comms_device_sendrecv, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, - perform_test_comms_send_recv, -) - -import os -os.environ["UCX_LOG_LEVEL"] = "error" - - -def func_test_send_recv(sessionId, n_trials): - handle = local_handle(sessionId, dask_worker=get_worker()) - return perform_test_comms_send_recv(handle, n_trials) - - -def func_test_collective(func, sessionId, root): - handle = local_handle(sessionId, dask_worker=get_worker()) - return func(handle, root) - - -if __name__ == "__main__": - # initial setup - cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) - client = Client(cluster) - - n_trials = 5 - root_location = "client" - - # p2p test for ucx - cb = Comms(comms_p2p=True, verbose=True) - cb.init() - - dfs = [ - client.submit( - func_test_send_recv, - cb.sessionId, - n_trials, - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - - wait(dfs, timeout=5) - - assert list(map(lambda x: x.result(), dfs)) - - cb.destroy() - - # collectives test for nccl - - cb = Comms( - verbose=True, client=client, nccl_root_location=root_location - ) - cb.init() - - for k, v in cb.worker_info(cb.worker_addresses).items(): - - dfs = [ - client.submit( - func_test_collective, - perform_test_comms_allgather, - cb.sessionId, - v["rank"], - pure=False, - workers=[w], - ) - for w in cb.worker_addresses - ] - wait(dfs, timeout=5) - - assert all([x.result() for x in dfs]) - - cb.destroy() - - # final client and cluster teardown - client.close() - cluster.close() diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index e27532a489..590c3eb68b 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -20,12 +20,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.6.* +- distributed-ucxx==0.38.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - ipython - joblib>=0.11 - libcublas-dev=11.11.3.6 @@ -36,6 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libucxx==0.38.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -46,16 +46,14 @@ dependencies: - pydata-sphinx-theme - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.37.* -- ucx>=1.15.0,<1.16.0 +- ucx-py==0.38.* name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bf535c5c04..00ed8fa65e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -20,12 +20,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.6.* +- distributed-ucxx==0.38.* - doxygen>=1.8.20 - 
gcc_linux-64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - ipython - joblib>=0.11 - libcublas-dev=11.11.3.6 @@ -36,6 +35,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libucxx==0.38.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -46,16 +46,14 @@ dependencies: - pydata-sphinx-theme - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.37.* -- ucx>=1.15.0,<1.16.0 +- ucx-py==0.38.* name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml index 8ea3843841..f1f346706d 100644 --- a/conda/environments/all_cuda-122_arch-aarch64.yaml +++ b/conda/environments/all_cuda-122_arch-aarch64.yaml @@ -21,18 +21,18 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.6.* +- distributed-ucxx==0.38.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - ipython - joblib>=0.11 - libcublas-dev - libcurand-dev - libcusolver-dev - libcusparse-dev +- libucxx==0.38.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -42,16 +42,14 @@ dependencies: - pydata-sphinx-theme - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.37.* -- ucx>=1.15.0,<1.16.0 +- ucx-py==0.38.* name: all_cuda-122_arch-aarch64 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index a3f6f7e99f..505a4f1a97 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -21,18 +21,18 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.6.* +- distributed-ucxx==0.38.* - doxygen>=1.8.20 - gcc_linux-64=11.* -- gmock>=1.13.0 - graphviz -- gtest>=1.13.0 - ipython - joblib>=0.11 - libcublas-dev - libcurand-dev - libcusolver-dev - libcusparse-dev +- libucxx==0.38.* - nccl>=2.9.9 - ninja - numba>=0.57 @@ -42,16 +42,14 @@ dependencies: - pydata-sphinx-theme - pytest-cov - pytest==7.* -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.6.* - recommonmark -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - scikit-learn - scipy - sphinx-copybutton - sphinx-markdown-tables - sysroot_linux-64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.37.* -- ucx>=1.15.0,<1.16.0 +- ucx-py==0.38.* name: all_cuda-122_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 0e0385ceeb..7315f82c13 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -30,6 +30,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libucxx==0.38.* - matplotlib - nccl>=2.9.9 - ninja @@ -38,7 +39,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml 
b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index dfe76a2948..ff973acc0c 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -30,6 +30,7 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- libucxx==0.38.* - matplotlib - nccl>=2.9.9 - ninja @@ -38,7 +39,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 0a6567c646..056550fc07 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -27,6 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev +- libucxx==0.38.* - matplotlib - nccl>=2.9.9 - ninja @@ -34,7 +35,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index a89d5317b6..41a48f4a12 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -27,6 +27,7 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev +- libucxx==0.38.* - matplotlib - nccl>=2.9.9 - ninja @@ -34,7 +35,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.6.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 9c39da4507..bb9c715e3a 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -10,7 +10,10 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: @@ -19,12 +22,6 @@ cmake_version: nccl_version: - ">=2.9.9" -gbench_version: - - "==1.8.0" - -gtest_version: - - ">=1.13.0" - glog_version: - ">=0.6.0" diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml index 55f326dc53..a075308500 100644 --- a/conda/recipes/libraft/meta.yaml +++ b/conda/recipes/libraft/meta.yaml @@ -58,12 +58,13 @@ outputs: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - cuda-version ={{ cuda_version }} {% if cuda_major != "11" %} - cuda-cudart-dev {% endif %} + - librmm ={{ minor_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -93,6 +94,7 @@ outputs: requirements: host: - cuda-version ={{ cuda_version }} + - librmm ={{ minor_version }} run: - {{ pin_subpackage('libraft-headers-only', exact=True) }} - librmm ={{ minor_version }} @@ -150,7 +152,7 @@ outputs: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - {{ pin_subpackage('libraft-headers', exact=True) }} - cuda-version ={{ cuda_version }} @@ -212,7 +214,7 @@ outputs: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ 
stdlib("c") }} host: - {{ pin_subpackage('libraft-headers', exact=True) }} - cuda-version ={{ cuda_version }} @@ -278,7 +280,7 @@ outputs: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: # We must include both libraft and libraft-static to prevent the test # builds from packaging those libraries. However, tests only depend on @@ -304,9 +306,6 @@ outputs: - libcusolver-dev - libcusparse-dev {% endif %} - - benchmark {{ gbench_version }} - - gmock {{ gtest_version }} - - gtest {{ gtest_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} @@ -319,9 +318,6 @@ outputs: - libcusparse {% endif %} - {{ pin_subpackage('libraft', exact=True) }} - - benchmark {{ gbench_version }} - - gmock {{ gtest_version }} - - gtest {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 @@ -353,7 +349,7 @@ outputs: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - {{ pin_subpackage('libraft', exact=True) }} - {{ pin_subpackage('libraft-headers', exact=True) }} diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml index e28b98da7f..e3ca633eb9 100644 --- a/conda/recipes/pylibraft/conda_build_config.yaml +++ b/conda/recipes/pylibraft/conda_build_config.yaml @@ -10,7 +10,10 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml index e524a68f9e..cbeaec3b55 100644 --- a/conda/recipes/pylibraft/meta.yaml +++ b/conda/recipes/pylibraft/meta.yaml @@ -39,7 +39,7 @@ requirements: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - cuda-python >=11.7.1,<12.0a0 diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml index 93a5532962..4de3b98f48 100644 --- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml @@ -4,7 +4,10 @@ c_compiler_version: cxx_compiler_version: - 11 -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml index fce85d5ffc..d0748fdb16 100644 --- a/conda/recipes/raft-ann-bench-cpu/meta.yaml +++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Usage: # conda build . 
-c conda-forge -c nvidia -c rapidsai @@ -42,7 +42,7 @@ requirements: - {{ compiler('cxx') }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - glog {{ glog_version }} diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index da0b893c1d..cf025a06a4 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -10,7 +10,10 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: +c_stdlib: + - sysroot + +c_stdlib_version: - "2.17" cmake_version: @@ -19,9 +22,6 @@ cmake_version: nccl_version: - ">=2.9.9" -gtest_version: - - ">=1.13.0" - glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml index ec24501475..8a6a3d033d 100644 --- a/conda/recipes/raft-ann-bench/meta.yaml +++ b/conda/recipes/raft-ann-bench/meta.yaml @@ -57,7 +57,7 @@ requirements: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: - python diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index d2bdcbb351..b157e41753 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -10,14 +10,17 @@ cuda_compiler: cuda11_compiler: - nvcc -sysroot_version: - - "2.17" +c_stdlib: + - sysroot -ucx_version: - - ">=1.15.0,<1.16.0" +c_stdlib_version: + - "2.17" ucx_py_version: - - "0.37.*" + - "0.38.*" + +ucxx_version: + - "0.38.*" cmake_version: - ">=3.26.4" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 6910905d07..af22c8853e 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -39,7 +39,7 @@ requirements: - cuda-version ={{ cuda_version }} - cmake {{ cmake_version }} - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} + - {{ stdlib("c") }} host: {% if cuda_major == "11" %} - cuda-python >=11.7.1,<12.0a0 @@ -56,9 +56,8 @@ requirements: - rmm ={{ minor_version }} - scikit-build-core >=0.7.0 - setuptools - - ucx {{ ucx_version }} - - ucx-proc=*=gpu - ucx-py {{ ucx_py_version }} + - ucxx {{ ucxx_version }} run: {% if cuda_major == "11" %} - cudatoolkit @@ -73,9 +72,8 @@ requirements: - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} - - ucx {{ ucx_version }} - - ucx-proc=*=gpu - ucx-py {{ ucx_py_version }} + - distributed-ucxx {{ ucxx_version }} tests: requirements: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cbae4bfb3f..39472cae67 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,12 +185,13 @@ if(NOT BUILD_CPU_ONLY) endif() if(BUILD_TESTS) - include(cmake/thirdparty/get_gtest.cmake) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) endif() if(BUILD_PRIMS_BENCH OR BUILD_ANN_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) - rapids_cpm_gbench() + rapids_cpm_gbench(BUILD_STATIC) endif() if(BUILD_CAGRA_HNSWLIB) @@ -274,7 +275,7 @@ else() "\" OFF)" [=[ -target_link_libraries(raft::raft INTERFACE $<$:CUDA::nvToolsExt>) +target_link_libraries(raft::raft INTERFACE $<$:CUDA::nvtx3>) target_compile_definitions(raft::raft INTERFACE $<$:NVTX_ENABLED>) ]=] @@ -564,7 +565,6 @@ if(RAFT_COMPILE_LIBRARY) src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu 
src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu - src/util/memory_pool.cpp ) set_target_properties( raft_objs @@ -650,12 +650,21 @@ rapids_find_generate_module( INSTALL_EXPORT_SET raft-distributed-exports ) -rapids_export_package(BUILD ucx raft-distributed-exports) -rapids_export_package(INSTALL ucx raft-distributed-exports) +rapids_export_package( + BUILD ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx ucxx::python +) +rapids_export_package( + INSTALL ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx + ucxx::python +) rapids_export_package(BUILD NCCL raft-distributed-exports) rapids_export_package(INSTALL NCCL raft-distributed-exports) -target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL) +# ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times, +# so rather than exporting a package dependency on it above we rely on consumers to find it +# themselves. Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can export it above +# again. +target_link_libraries(raft_distributed INTERFACE ucx::ucp ucxx::ucxx NCCL::NCCL) # ################################################################################################## # * install targets----------------------------------------------------------- @@ -816,26 +825,26 @@ rapids_export( # * shared test/bench headers ------------------------------------------------ if(BUILD_TESTS OR BUILD_PRIMS_BENCH) - include(internal/CMakeLists.txt) + add_subdirectory(internal) endif() # ################################################################################################## # * build test executable ---------------------------------------------------- if(BUILD_TESTS) - include(test/CMakeLists.txt) + add_subdirectory(test) endif() # ################################################################################################## # * build benchmark executable ----------------------------------------------- if(BUILD_PRIMS_BENCH) - include(bench/prims/CMakeLists.txt) + add_subdirectory(bench/prims/) endif() # ################################################################################################## # * build ann benchmark executable ----------------------------------------------- if(BUILD_ANN_BENCH) - include(bench/ann/CMakeLists.txt) + add_subdirectory(bench/ann/) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index ee84f7515a..f489cc62c6 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -12,6 +12,8 @@ # the License. 
# ============================================================================= +list(APPEND CMAKE_MODULE_PATH "${RAFT_SOURCE_DIR}") + # ################################################################################################## # * benchmark options ------------------------------------------------------------------------------ @@ -40,48 +42,26 @@ option(RAFT_ANN_BENCH_SINGLE_EXE find_package(Threads REQUIRED) +set(RAFT_ANN_BENCH_USE_FAISS ON) +set(RAFT_FAISS_ENABLE_GPU ON) +set(RAFT_USE_FAISS_STATIC ON) + if(BUILD_CPU_ONLY) # Include necessary logging dependencies - include(cmake/thirdparty/get_fmt.cmake) - include(cmake/thirdparty/get_spdlog.cmake) - + include(cmake/thirdparty/get_fmt) + include(cmake/thirdparty/get_spdlog) set(RAFT_FAISS_ENABLE_GPU OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF) set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF) set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF) set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF) set(RAFT_ANN_BENCH_USE_GGNN OFF) -else() +elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled. # https://github.com/rapidsai/raft/issues/1627 - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0) - set(RAFT_FAISS_ENABLE_GPU OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF) - set(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT OFF) - set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ OFF) - set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT OFF) - else() - set(RAFT_FAISS_ENABLE_GPU ON) - endif() -endif() - -set(RAFT_ANN_BENCH_USE_FAISS OFF) -if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT - OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ - OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT - OR RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT - OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ - OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT -) - set(RAFT_ANN_BENCH_USE_FAISS ON) - set(RAFT_USE_FAISS_STATIC ON) + set(RAFT_FAISS_ENABLE_GPU OFF) endif() set(RAFT_ANN_BENCH_USE_RAFT OFF) @@ -98,21 +78,17 @@ endif() # * Fetch requirements ------------------------------------------------------------- if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) - include(cmake/thirdparty/get_hnswlib.cmake) + include(cmake/thirdparty/get_hnswlib) endif() -include(cmake/thirdparty/get_nlohmann_json.cmake) +include(cmake/thirdparty/get_nlohmann_json) if(RAFT_ANN_BENCH_USE_GGNN) - include(cmake/thirdparty/get_ggnn.cmake) + include(cmake/thirdparty/get_ggnn) endif() if(RAFT_ANN_BENCH_USE_FAISS) - # We need to ensure that faiss has all the conda information. 
So we currently use the very ugly - # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory - # will have the conda includes/link dirs - link_libraries($) - include(cmake/thirdparty/get_faiss.cmake) + include(cmake/thirdparty/get_faiss) endif() # ################################################################################################## @@ -173,8 +149,6 @@ function(ConfigureAnnBench) $<$:${RAFT_CTK_MATH_DEPENDENCIES}> $ $ - -static-libgcc - -static-libstdc++ $<$:fmt::fmt-header-only> $<$:spdlog::spdlog_header_only> ) @@ -225,7 +199,7 @@ endfunction() if(RAFT_ANN_BENCH_USE_HNSWLIB) ConfigureAnnBench( - NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib + NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib ) endif() @@ -235,8 +209,8 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ) NAME RAFT_IVF_PQ PATH - bench/ann/src/raft/raft_benchmark.cu - $<$:bench/ann/src/raft/raft_ivf_pq.cu> + src/raft/raft_benchmark.cu + src/raft/raft_ivf_pq.cu LINKS raft::compiled ) @@ -247,8 +221,8 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT) NAME RAFT_IVF_FLAT PATH - bench/ann/src/raft/raft_benchmark.cu - $<$:bench/ann/src/raft/raft_ivf_flat.cu> + src/raft/raft_benchmark.cu + src/raft/raft_ivf_flat.cu LINKS raft::compiled ) @@ -256,7 +230,7 @@ endif() if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE) ConfigureAnnBench( - NAME RAFT_BRUTE_FORCE PATH bench/ann/src/raft/raft_benchmark.cu LINKS raft::compiled + NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled ) endif() @@ -265,8 +239,11 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA) NAME RAFT_CAGRA PATH - bench/ann/src/raft/raft_benchmark.cu - $<$:bench/ann/src/raft/raft_cagra.cu> + src/raft/raft_benchmark.cu + src/raft/raft_cagra_float.cu + src/raft/raft_cagra_half.cu + src/raft/raft_cagra_int8_t.cu + src/raft/raft_cagra_uint8_t.cu LINKS raft::compiled ) @@ -274,76 +251,63 @@ endif() if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB) ConfigureAnnBench( - NAME RAFT_CAGRA_HNSWLIB PATH bench/ann/src/raft/raft_cagra_hnswlib.cu LINKS raft::compiled + NAME RAFT_CAGRA_HNSWLIB PATH src/raft/raft_cagra_hnswlib.cu LINKS raft::compiled hnswlib::hnswlib ) endif() -set(RAFT_FAISS_TARGETS faiss::faiss) -if(TARGET faiss::faiss_avx2) - set(RAFT_FAISS_TARGETS faiss::faiss_avx2) -endif() - message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}") message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}") if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT) ConfigureAnnBench( - NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS} ) endif() if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT) ConfigureAnnBench( - NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS} ) endif() if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ) ConfigureAnnBench( - NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS + NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${RAFT_FAISS_TARGETS} ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT) +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND RAFT_FAISS_ENABLE_GPU) ConfigureAnnBench( - NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ) +if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND 
RAFT_FAISS_ENABLE_GPU) ConfigureAnnBench( - NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS + NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} ) endif() -if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT) +if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT AND RAFT_FAISS_ENABLE_GPU) ConfigureAnnBench( - NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} + NAME FAISS_GPU_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS} ) endif() if(RAFT_ANN_BENCH_USE_GGNN) - include(cmake/thirdparty/get_glog.cmake) - ConfigureAnnBench(NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn) + include(cmake/thirdparty/get_glog) + ConfigureAnnBench(NAME GGNN PATH src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn) endif() # ################################################################################################## # * Dynamically-loading ANN_BENCH executable ------------------------------------------------------- if(RAFT_ANN_BENCH_SINGLE_EXE) - add_executable(ANN_BENCH bench/ann/src/common/benchmark.cpp) - - # Build and link static version of the GBench to keep ANN_BENCH self-contained. - get_target_property(TMP_PROP benchmark::benchmark SOURCES) - add_library(benchmark_static STATIC ${TMP_PROP}) - get_target_property(TMP_PROP benchmark::benchmark INCLUDE_DIRECTORIES) - target_include_directories(benchmark_static PUBLIC ${TMP_PROP}) - get_target_property(TMP_PROP benchmark::benchmark LINK_LIBRARIES) - target_link_libraries(benchmark_static PUBLIC ${TMP_PROP}) + add_executable(ANN_BENCH src/common/benchmark.cpp) target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) @@ -351,7 +315,7 @@ if(RAFT_ANN_BENCH_SINGLE_EXE) ANN_BENCH PRIVATE raft::raft nlohmann_json::nlohmann_json - benchmark_static + benchmark::benchmark dl -static-libgcc fmt::fmt-header-only diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp index c6213059dc..b010063dee 100644 --- a/cpp/bench/ann/src/common/ann_types.hpp +++ b/cpp/bench/ann/src/common/ann_types.hpp @@ -73,6 +73,8 @@ struct AlgoProperty { class AnnBase { public: + using index_type = size_t; + inline AnnBase(Metric metric, int dim) : metric_(metric), dim_(dim) {} virtual ~AnnBase() noexcept = default; @@ -98,7 +100,16 @@ class AnnGPU { * end. */ [[nodiscard]] virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0; - virtual ~AnnGPU() noexcept = default; + /** + * By default a GPU algorithm uses a fixed stream to order GPU operations. + * However, an algorithm may need to synchronize with the host at the end of its execution. + * In that case, also synchronizing with a benchmark event would put it at disadvantage. + * + * We can disable event sync by passing `false` here + * - ONLY IF THE ALGORITHM HAS PRODUCED ITS OUTPUT BY THE TIME IT SYNCHRONIZES WITH CPU. + */ + [[nodiscard]] virtual auto uses_stream() const noexcept -> bool { return true; } + virtual ~AnnGPU() noexcept = default; }; template @@ -118,8 +129,11 @@ class ANN : public AnnBase { virtual void set_search_param(const AnnSearchParam& param) = 0; // TODO: this assumes that an algorithm can always return k results. // This is not always possible. 
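The uses_stream() hook introduced above lets a wrapper opt out of the benchmark's event-based stream synchronization. A minimal sketch of how an implementation might use it (hypothetical wrapper, other required ANN/AnnGPU members omitted); as the comment above stresses, this is only safe when search() has already delivered its output to the caller by the time it returns:

// Sketch only: a GPU algorithm that synchronizes with the host inside search().
template <typename T>
class HostSyncedAlgo : public ANN<T>, public AnnGPU {
 public:
  [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; }
  // Results are already visible to the host when search() returns,
  // so the extra benchmark event sync can be skipped.
  [[nodiscard]] auto uses_stream() const noexcept -> bool override { return false; }
  // ... build(), search(), save(), load(), etc. omitted ...
 private:
  cudaStream_t stream_{nullptr};
};
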
- virtual void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const = 0; + virtual void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const = 0; virtual void save(const std::string& file) const = 0; virtual void load(const std::string& file) = 0; diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index d7bcd17a00..8762ccd1fe 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -280,10 +280,16 @@ void bench_search(::benchmark::State& state, /** * Each thread will manage its own outputs */ - std::shared_ptr> distances = - std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); - std::shared_ptr> neighbors = - std::make_shared>(current_algo_props->query_memory_type, k * query_set_size); + using index_type = AnnBase::index_type; + constexpr size_t kAlignResultBuf = 64; + size_t result_elem_count = k * query_set_size; + result_elem_count = + ((result_elem_count + kAlignResultBuf - 1) / kAlignResultBuf) * kAlignResultBuf; + auto& result_buf = + get_result_buffer_from_global_pool(result_elem_count * (sizeof(float) + sizeof(index_type))); + auto* neighbors_ptr = + reinterpret_cast(result_buf.data(current_algo_props->query_memory_type)); + auto* distances_ptr = reinterpret_cast(neighbors_ptr + result_elem_count); { nvtx_case nvtx{state.name()}; @@ -305,8 +311,8 @@ void bench_search(::benchmark::State& state, algo->search(query_set + batch_offset * dataset->dim(), n_queries, k, - neighbors->data + out_offset * k, - distances->data + out_offset * k); + neighbors_ptr + out_offset * k, + distances_ptr + out_offset * k); } catch (const std::exception& e) { state.SkipWithError("Benchmark loop: " + std::string(e.what())); break; @@ -338,12 +344,13 @@ void bench_search(::benchmark::State& state, // Each thread calculates recall on their partition of queries. // evaluate recall if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t max_k = dataset->max_k(); - buf neighbors_host = neighbors->move(MemoryType::Host); - std::size_t rows = std::min(queries_processed, query_set_size); - std::size_t match_count = 0; - std::size_t total_count = rows * static_cast(k); + const std::int32_t* gt = dataset->gt_set(); + const std::uint32_t max_k = dataset->max_k(); + result_buf.transfer_data(MemoryType::Host, current_algo_props->query_memory_type); + auto* neighbors_host = reinterpret_cast(result_buf.data(MemoryType::Host)); + std::size_t rows = std::min(queries_processed, query_set_size); + std::size_t match_count = 0; + std::size_t total_count = rows * static_cast(k); // We go through the groundtruth with same stride as the benchmark loop. size_t out_offset = 0; @@ -354,7 +361,7 @@ void bench_search(::benchmark::State& state, size_t i_out_idx = out_offset + i; if (i_out_idx < rows) { for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]); + auto act_idx = std::int32_t(neighbors_host[i_out_idx * k + j]); for (std::uint32_t l = 0; l < k; l++) { auto exp_idx = gt[i_orig_idx * max_k + l]; if (act_idx == exp_idx) { @@ -717,7 +724,7 @@ inline auto run_main(int argc, char** argv) -> int // to a shared library it depends on (dynamic benchmark executable). 
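To make the shared result-buffer sizing above concrete, the same round-up computed for one illustrative configuration (assuming a 64-bit build, where AnnBase::index_type is 8 bytes):

// k = 10 neighbors, query_set_size = 1000 queries
size_t result_elem_count = 10 * 1000;                      // 10000 elements
// round up to a multiple of kAlignResultBuf = 64:
result_elem_count = ((10000 + 63) / 64) * 64;              // = 157 * 64 = 10048
// one pooled buffer holds both outputs: indices first, then distances
size_t bytes = 10048 * (sizeof(size_t) + sizeof(float));   // 10048 * 12 = 120576 bytes
// neighbors_ptr starts at element 0; distances_ptr starts at element 10048
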
current_algo.reset(); current_algo_props.reset(); - reset_global_stream_pool(); + reset_global_device_resources(); return 0; } }; // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp index 6cdff316e9..96185c79eb 100644 --- a/cpp/bench/ann/src/common/util.hpp +++ b/cpp/bench/ann/src/common/util.hpp @@ -56,57 +56,6 @@ inline thread_local int benchmark_thread_id = 0; */ inline thread_local int benchmark_n_threads = 1; -template -struct buf { - MemoryType memory_type; - std::size_t size; - T* data; - buf(MemoryType memory_type, std::size_t size) - : memory_type(memory_type), size(size), data(nullptr) - { - switch (memory_type) { -#ifndef BUILD_CPU_ONLY - case MemoryType::Device: { - cudaMalloc(reinterpret_cast(&data), size * sizeof(T)); - cudaMemset(data, 0, size * sizeof(T)); - } break; -#endif - default: { - data = reinterpret_cast(malloc(size * sizeof(T))); - std::memset(data, 0, size * sizeof(T)); - } - } - } - ~buf() noexcept - { - if (data == nullptr) { return; } - switch (memory_type) { -#ifndef BUILD_CPU_ONLY - case MemoryType::Device: { - cudaFree(data); - } break; -#endif - default: { - free(data); - } - } - } - - [[nodiscard]] auto move(MemoryType target_memory_type) -> buf - { - buf r{target_memory_type, size}; -#ifndef BUILD_CPU_ONLY - if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) || - (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) { - cudaMemcpy(r.data, data, size * sizeof(T), cudaMemcpyDefault); - return r; - } -#endif - std::swap(data, r.data); - return r; - } -}; - struct cuda_timer { private: std::optional stream_; @@ -118,7 +67,9 @@ struct cuda_timer { static inline auto extract_stream(AnnT* algo) -> std::optional { auto gpu_ann = dynamic_cast(algo); - if (gpu_ann != nullptr) { return std::make_optional(gpu_ann->get_sync_stream()); } + if (gpu_ann != nullptr && gpu_ann->uses_stream()) { + return std::make_optional(gpu_ann->get_sync_stream()); + } return std::nullopt; } @@ -242,16 +193,102 @@ inline auto get_stream_from_global_pool() -> cudaStream_t #endif } +struct result_buffer { + explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream} + { + if (size_ == 0) { return; } + data_host_ = malloc(size_); +#ifndef BUILD_CPU_ONLY + cudaMallocAsync(&data_device_, size_, stream_); + cudaStreamSynchronize(stream_); +#endif + } + result_buffer() = delete; + result_buffer(result_buffer&&) = delete; + result_buffer& operator=(result_buffer&&) = delete; + result_buffer(const result_buffer&) = delete; + result_buffer& operator=(const result_buffer&) = delete; + ~result_buffer() noexcept + { + if (size_ == 0) { return; } +#ifndef BUILD_CPU_ONLY + cudaFreeAsync(data_device_, stream_); + cudaStreamSynchronize(stream_); +#endif + free(data_host_); + } + + [[nodiscard]] auto size() const noexcept { return size_; } + [[nodiscard]] auto data(ann::MemoryType loc) const noexcept + { + switch (loc) { + case MemoryType::Device: return data_device_; + default: return data_host_; + } + } + + void transfer_data(ann::MemoryType dst, ann::MemoryType src) + { + auto dst_ptr = data(dst); + auto src_ptr = data(src); + if (dst_ptr == src_ptr) { return; } +#ifndef BUILD_CPU_ONLY + cudaMemcpyAsync(dst_ptr, src_ptr, size_, cudaMemcpyDefault, stream_); + cudaStreamSynchronize(stream_); +#endif + } + + private: + size_t size_{0}; + cudaStream_t stream_ = nullptr; + void* data_host_ = nullptr; + void* data_device_ = nullptr; +}; + +namespace detail { 
+inline std::vector> global_result_buffer_pool(0); +inline std::mutex grp_mutex; +} // namespace detail + +/** + * Get a result buffer associated with the current benchmark thread. + * + * Note, the allocations are reused between the benchmark cases. + * This reduces the setup overhead and number of times the context is being blocked + * (this is relevant if there is a persistent kernel running across multiples benchmark cases). + */ +inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer& +{ + auto stream = get_stream_from_global_pool(); + auto& rb = [stream, size]() -> result_buffer& { + std::lock_guard guard(detail::grp_mutex); + if (static_cast(detail::global_result_buffer_pool.size()) < benchmark_n_threads) { + detail::global_result_buffer_pool.resize(benchmark_n_threads); + } + auto& rb = detail::global_result_buffer_pool[benchmark_thread_id]; + if (!rb || rb->size() < size) { rb = std::make_unique(size, stream); } + return *rb; + }(); + + memset(rb.data(MemoryType::Host), 0, size); +#ifndef BUILD_CPU_ONLY + cudaMemsetAsync(rb.data(MemoryType::Device), 0, size, stream); + cudaStreamSynchronize(stream); +#endif + return rb; +} + /** - * Delete all streams in the global pool. + * Delete all streams and memory allocations in the global pool. * It's called at the end of the `main` function - before global/static variables and cuda context * is destroyed - to make sure they are destroyed gracefully and correctly seen by analysis tools * such as nsys. */ -inline void reset_global_stream_pool() +inline void reset_global_device_resources() { #ifndef BUILD_CPU_ONLY std::lock_guard guard(detail::gsp_mutex); + detail::global_result_buffer_pool.resize(0); detail::global_stream_pool.resize(0); #endif } diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h index 407f7148df..3caca15b7f 100644 --- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h @@ -88,8 +88,11 @@ class FaissCpu : public ANN { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; AlgoProperty get_preference() const override { @@ -169,7 +172,7 @@ void FaissCpu::set_search_param(const AnnSearchParam& param) template void FaissCpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h index 633098fd1d..2effe631e5 100644 --- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h +++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h @@ -111,8 +111,11 @@ class FaissGpu : public ANN, public AnnGPU { // TODO: if the number of results is less than k, the remaining elements of 'neighbors' // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; [[nodiscard]] auto get_sync_stream() const 
noexcept -> cudaStream_t override { @@ -196,7 +199,7 @@ void FaissGpu::build(const T* dataset, size_t nrow) template void FaissGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(faiss::idx_t), "sizes of size_t and faiss::idx_t are different"); diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh index c89f02d974..59cf3df806 100644 --- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh +++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh @@ -58,8 +58,11 @@ class Ggnn : public ANN, public AnnGPU { void build(const T* dataset, size_t nrow) override { impl_->build(dataset, nrow); } void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); } - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override { impl_->search(queries, batch_size, k, neighbors, distances); } @@ -123,8 +126,11 @@ class GgnnImpl : public ANN, public AnnGPU { void build(const T* dataset, size_t nrow) override; void set_search_param(const AnnSearchParam& param) override; - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; } void save(const std::string& file) const override; @@ -243,7 +249,7 @@ void GgnnImpl::set_search_param(const AnnSearc template void GgnnImpl::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different"); if (k != KQuery) { diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h index a8f7dd824f..5743632bf4 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h +++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h @@ -79,8 +79,11 @@ class HnswLib : public ANN { void build(const T* dataset, size_t nrow) override; void set_search_param(const AnnSearchParam& param) override; - void search( - const T* query, int batch_size, int k, size_t* indices, float* distances) const override; + void search(const T* query, + int batch_size, + int k, + AnnBase::index_type* indices, + float* distances) const override; void save(const std::string& path_to_index) const override; void load(const std::string& path_to_index) override; @@ -97,7 +100,10 @@ class HnswLib : public ANN { void set_base_layer_only() { appr_alg_->base_layer_only = true; } private: - void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const; + void get_search_knn_results_(const T* query, + int k, + AnnBase::index_type* indices, + float* distances) const; std::shared_ptr::type>> appr_alg_; std::shared_ptr::type>> space_; @@ -176,7 +182,7 @@ void HnswLib::set_search_param(const AnnSearchParam& param_) template void HnswLib::search( - const T* query, int batch_size, int k, size_t* indices, float* distances) const + const T* query, int batch_size, int k, 
AnnBase::index_type* indices, float* distances) const { auto f = [&](int i) { // hnsw can only handle a single vector at a time. @@ -217,7 +223,7 @@ void HnswLib::load(const std::string& path_to_index) template void HnswLib::get_search_knn_results_(const T* query, int k, - size_t* indices, + AnnBase::index_type* indices, float* distances) const { auto result = appr_alg_->searchKnn(query, k); diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index 40c1ecfa5e..9b086fdb23 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -19,14 +19,19 @@ #include #include +#include +#include #include #include #include +#include #include #include #include +#include #include +#include #include #include @@ -70,13 +75,14 @@ inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool */ class shared_raft_resources { public: - using pool_mr_type = rmm::mr::pool_memory_resource; - using mr_type = rmm::mr::failure_callback_resource_adaptor; + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor; + using large_mr_type = rmm::mr::managed_memory_resource; shared_raft_resources() try : orig_resource_{rmm::mr::get_current_device_resource()}, pool_resource_(orig_resource_, 1024 * 1024 * 1024ull), - resource_(&pool_resource_, rmm_oom_callback, nullptr) { + resource_(&pool_resource_, rmm_oom_callback, nullptr), large_mr_() { rmm::mr::set_current_device_resource(&resource_); } catch (const std::exception& e) { auto cuda_status = cudaGetLastError(); @@ -99,10 +105,16 @@ class shared_raft_resources { ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } + auto get_large_memory_resource() noexcept + { + return static_cast(&large_mr_); + } + private: rmm::mr::device_memory_resource* orig_resource_; pool_mr_type pool_resource_; mr_type resource_; + large_mr_type large_mr_; }; /** @@ -121,8 +133,16 @@ class configured_raft_resources { * It's used by the copy constructor. */ explicit configured_raft_resources(const std::shared_ptr& shared_res) - : shared_res_{shared_res}, res_{rmm::cuda_stream_view(get_stream_from_global_pool())} + : shared_res_{shared_res}, + res_{std::make_unique( + rmm::cuda_stream_view(get_stream_from_global_pool()))} { + // set the large workspace resource to the raft handle, but without the deleter + // (this resource is managed by the shared_res). + raft::resource::set_large_workspace_resource( + *res_, + std::shared_ptr(shared_res_->get_large_memory_resource(), + raft::void_op{})); } /** Default constructor creates all resources anew. 
*/ @@ -130,9 +150,9 @@ class configured_raft_resources { { } - configured_raft_resources(configured_raft_resources&&) = default; - configured_raft_resources& operator=(configured_raft_resources&&) = default; - ~configured_raft_resources() = default; + configured_raft_resources(configured_raft_resources&&); + configured_raft_resources& operator=(configured_raft_resources&&); + ~configured_raft_resources() = default; configured_raft_resources(const configured_raft_resources& res) : configured_raft_resources{res.shared_res_} { @@ -143,11 +163,11 @@ class configured_raft_resources { return *this; } - operator raft::resources&() noexcept { return res_; } - operator const raft::resources&() const noexcept { return res_; } + operator raft::resources&() noexcept { return *res_; } + operator const raft::resources&() const noexcept { return *res_; } /** Get the main stream */ - [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(res_); } + [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(*res_); } private: /** The resources shared among multiple raft handles / threads. */ @@ -156,7 +176,80 @@ class configured_raft_resources { * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must * have its own copy of it. */ - raft::device_resources res_; + std::unique_ptr res_ = std::make_unique(); }; +inline configured_raft_resources::configured_raft_resources(configured_raft_resources&&) = default; +inline configured_raft_resources& configured_raft_resources::operator=( + configured_raft_resources&&) = default; + +/** A helper to refine the neighbors when the data is on device or on host. */ +template +void refine_helper(const raft::resources& res, + DatasetT dataset, + QueriesT queries, + CandidatesT candidates, + int k, + AnnBase::index_type* neighbors, + float* distances, + raft::distance::DistanceType metric) +{ + using data_type = typename DatasetT::value_type; + using index_type = AnnBase::index_type; + using extents_type = index_type; // device-side refine requires this + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + extents_type batch_size = queries.extent(0); + extents_type dim = queries.extent(1); + extents_type k0 = candidates.extent(1); + + if (raft::get_device_for_address(dataset.data_handle()) >= 0) { + auto dataset_device = raft::make_device_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_device = raft::make_device_matrix_view( + queries.data_handle(), batch_size, dim); + auto candidates_device = raft::make_device_matrix_view( + candidates.data_handle(), batch_size, k0); + auto neighbors_device = + raft::make_device_matrix_view(neighbors, batch_size, k); + auto distances_device = + raft::make_device_matrix_view(distances, batch_size, k); + + raft::neighbors::refine(res, + dataset_device, + queries_device, + candidates_device, + neighbors_device, + distances_device, + metric); + } else { + auto dataset_host = raft::make_host_matrix_view( + dataset.data_handle(), dataset.extent(0), dataset.extent(1)); + auto queries_host = raft::make_host_matrix(batch_size, dim); + auto candidates_host = raft::make_host_matrix(batch_size, k0); + auto neighbors_host = raft::make_host_matrix(batch_size, k); + auto distances_host = raft::make_host_matrix(batch_size, k); + + auto stream = resource::get_cuda_stream(res); + raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), 
stream); + raft::copy( + candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); + + raft::resource::sync_stream(res); // wait for the queries and candidates + raft::neighbors::refine(res, + dataset_host, + queries_host.view(), + candidates_host.view(), + neighbors_host.view(), + distances_host.view(), + metric); + + raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream); + raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); + } +} + } // namespace raft::bench::ann diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/bench/ann/src/raft/raft_cagra_float.cu similarity index 63% rename from cpp/include/raft/util/memory_pool-ext.hpp rename to cpp/bench/ann/src/raft/raft_cagra_float.cu index 030a9c681e..058f5bf34a 100644 --- a/cpp/include/raft/util/memory_pool-ext.hpp +++ b/cpp/bench/ann/src/raft/raft_cagra_float.cu @@ -13,16 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "raft_cagra_wrapper.h" -#pragma once -#include // rmm::mr::device_memory_resource - -#include // size_t -#include // std::unique_ptr - -namespace raft { - -std::unique_ptr get_pool_memory_resource( - rmm::mr::device_memory_resource*& mr, size_t initial_size); - -} // namespace raft +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_half.cu b/cpp/bench/ann/src/raft/raft_cagra_half.cu new file mode 100644 index 0000000000..a015819ec5 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_half.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu index 709b08db76..d9ef1d74a3 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu @@ -20,6 +20,7 @@ #include #include +#include #define JSON_DIAGNOSTICS 1 #include @@ -89,10 +90,11 @@ int main(int argc, char** argv) // and is initially sized to half of free device memory. 
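The hunk below switches the process-wide RMM device resource to the pool for the duration of run_main and then restores the previous resource. A small RAII sketch of the same idiom (not part of this diff, just an illustration of the design choice; set_current_device_resource returns the resource it replaces):

// Sketch: restore the previous device resource automatically on scope exit.
struct scoped_device_resource {
  explicit scoped_device_resource(rmm::mr::device_memory_resource* mr)
    : old_{rmm::mr::set_current_device_resource(mr)} {}
  ~scoped_device_resource() { rmm::mr::set_current_device_resource(old_); }
  rmm::mr::device_memory_resource* old_;
};
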
rmm::mr::pool_memory_resource pool_mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource( - &pool_mr); // Updates the current device resource pointer to `pool_mr` - rmm::mr::device_memory_resource* mr = - rmm::mr::get_current_device_resource(); // Points to `pool_mr` - return raft::bench::ann::run_main(argc, argv); + // Updates the current device resource pointer to `pool_mr` + auto old_mr = rmm::mr::set_current_device_resource(&pool_mr); + auto ret = raft::bench::ann::run_main(argc, argv); + // Restores the current device resource pointer to its previous value + rmm::mr::set_current_device_resource(old_mr); + return ret; } #endif diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h index ed9c120ed4..1c4b847d1a 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h @@ -41,10 +41,11 @@ class RaftCagraHnswlib : public ANN, public AnnGPU { void set_search_param(const AnnSearchParam& param) override; - // TODO: if the number of results is less than k, the remaining elements of 'neighbors' - // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -99,7 +100,7 @@ void RaftCagraHnswlib::load(const std::string& file) template void RaftCagraHnswlib::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { hnswlib_search_.search(queries, batch_size, k, neighbors, distances); } diff --git a/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu new file mode 100644 index 0000000000..be3b83ee60 --- /dev/null +++ b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "raft_cagra_wrapper.h" + +namespace raft::bench::ann { +template class RaftCagra; +} // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra.cu b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu similarity index 85% rename from cpp/bench/ann/src/raft/raft_cagra.cu rename to cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu index c0c1352a43..c9679e404d 100644 --- a/cpp/bench/ann/src/raft/raft_cagra.cu +++ b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu @@ -17,7 +17,4 @@ namespace raft::bench::ann { template class RaftCagra; -template class RaftCagra; -template class RaftCagra; -template class RaftCagra; } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h index 70fd22001e..0b892dec35 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include @@ -96,12 +96,16 @@ class RaftCagra : public ANN, public AnnGPU { void set_search_dataset(const T* dataset, size_t nrow) override; - // TODO: if the number of results is less than k, the remaining elements of 'neighbors' - // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; - void search_base( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -138,7 +142,7 @@ class RaftCagra : public ANN, public AnnGPU { std::shared_ptr> dataset_; std::shared_ptr> input_dataset_v_; - inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type) + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { case (AllocatorType::HostPinned): return &mr_pinned_; @@ -272,15 +276,18 @@ std::unique_ptr> RaftCagra::copy() template void RaftCagra::search_base( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + IdxT* neighbors_IdxT; - rmm::device_uvector neighbors_storage(0, resource::get_cuda_stream(handle_)); - if constexpr (std::is_same_v) { - neighbors_IdxT = neighbors; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) { + neighbors_IdxT = reinterpret_cast(neighbors); } else { - neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_)); - neighbors_IdxT = neighbors_storage.data(); + neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage->data(); } auto queries_view = @@ -291,76 +298,36 @@ void RaftCagra::search_base( raft::neighbors::cagra::search( handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); - if constexpr (!std::is_same_v) { + if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) { raft::linalg::unaryOp(neighbors, neighbors_IdxT, batch_size * k, - raft::cast_op(), + raft::cast_op(), raft::resource::get_cuda_stream(handle_)); } } template void RaftCagra::search( - 
const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { auto k0 = static_cast(refine_ratio_ * k); const bool disable_refinement = k0 <= static_cast(k); const raft::resources& res = handle_; - auto stream = resource::get_cuda_stream(res); if (disable_refinement) { search_base(queries, batch_size, k, neighbors, distances); } else { - auto candidate_ixs = raft::make_device_matrix(res, batch_size, k0); - auto candidate_dists = raft::make_device_matrix(res, batch_size, k0); - search_base(queries, - batch_size, - k0, - reinterpret_cast(candidate_ixs.data_handle()), - candidate_dists.data_handle()); - - if (raft::get_device_for_address(input_dataset_v_->data_handle()) >= 0) { - auto queries_v = - raft::make_device_matrix_view(queries, batch_size, dimension_); - auto neighours_v = raft::make_device_matrix_view( - reinterpret_cast(neighbors), batch_size, k); - auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); - raft::neighbors::refine( - res, - *input_dataset_v_, - queries_v, - raft::make_const_mdspan(candidate_ixs.view()), - neighours_v, - distances_v, - index_->metric()); - } else { - auto dataset_host = raft::make_host_matrix_view( - input_dataset_v_->data_handle(), input_dataset_v_->extent(0), input_dataset_v_->extent(1)); - auto queries_host = raft::make_host_matrix(batch_size, dimension_); - auto candidates_host = raft::make_host_matrix(batch_size, k0); - auto neighbors_host = raft::make_host_matrix(batch_size, k); - auto distances_host = raft::make_host_matrix(batch_size, k); - - raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream); - raft::copy( - candidates_host.data_handle(), candidate_ixs.data_handle(), candidates_host.size(), stream); - - raft::resource::sync_stream(res); // wait for the queries and candidates - raft::neighbors::refine(res, - dataset_host, - queries_host.view(), - candidates_host.view(), - neighbors_host.view(), - distances_host.view(), - index_->metric()); - - raft::copy(neighbors, - reinterpret_cast(neighbors_host.data_handle()), - neighbors_host.size(), - stream); - raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); - } + auto queries_v = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + refine_helper( + res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); } } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 7f2996d77a..83a3a63aba 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -61,10 +61,11 @@ class RaftIvfFlatGpu : public ANN, public AnnGPU { void set_search_param(const AnnSearchParam& param) override; - // TODO: if the number of results is less than k, the remaining elements of 'neighbors' - // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; [[nodiscard]] auto 
get_sync_stream() const noexcept -> cudaStream_t override { @@ -131,10 +132,34 @@ std::unique_ptr> RaftIvfFlatGpu::copy() template void RaftIvfFlatGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { - static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t"); - raft::neighbors::ivf_flat::search( - handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances); + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_IdxT; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) { + neighbors_IdxT = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage->data(); + } + raft::neighbors::ivf_flat::search(handle_, + search_params_, + *index_, + queries, + batch_size, + k, + neighbors_IdxT, + distances, + resource::get_workspace_resource(handle_)); + if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_IdxT, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 5d8b682264..7201467969 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -32,9 +32,6 @@ #include #include -#include -#include - #include namespace raft::bench::ann { @@ -64,10 +61,16 @@ class RaftIvfPQ : public ANN, public AnnGPU { void set_search_param(const AnnSearchParam& param) override; void set_search_dataset(const T* dataset, size_t nrow) override; - // TODO: if the number of results is less than k, the remaining elements of 'neighbors' - // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const; [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { @@ -140,68 +143,61 @@ void RaftIvfPQ::set_search_dataset(const T* dataset, size_t nrow) dataset_ = raft::make_device_matrix_view(dataset, nrow, index_->dim()); } +template +void RaftIvfPQ::search_base( + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + + IdxT* neighbors_IdxT; + std::optional> neighbors_storage{std::nullopt}; + if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) { + neighbors_IdxT = reinterpret_cast(neighbors); + } else { + neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_)); + neighbors_IdxT = neighbors_storage->data(); + } + + auto queries_view = + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto neighbors_view = + raft::make_device_matrix_view(neighbors_IdxT, batch_size, k); + auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); + + raft::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, 
distances_view); + + if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) { + raft::linalg::unaryOp(neighbors, + neighbors_IdxT, + batch_size * k, + raft::cast_op(), + raft::resource::get_cuda_stream(handle_)); + } +} + template void RaftIvfPQ::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { - if (refine_ratio_ > 1.0f) { - uint32_t k0 = static_cast(refine_ratio_ * k); - auto queries_v = - raft::make_device_matrix_view(queries, batch_size, index_->dim()); - auto distances_tmp = raft::make_device_matrix(handle_, batch_size, k0); - auto candidates = raft::make_device_matrix(handle_, batch_size, k0); - - raft::neighbors::ivf_pq::search( - handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view()); - - if (raft::get_device_for_address(dataset_.data_handle()) >= 0) { - auto queries_v = - raft::make_device_matrix_view(queries, batch_size, index_->dim()); - auto neighbors_v = raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); - auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); - - raft::neighbors::refine(handle_, - dataset_, - queries_v, - candidates.view(), - neighbors_v, - distances_v, - index_->metric()); - } else { - auto queries_host = raft::make_host_matrix(batch_size, index_->dim()); - auto candidates_host = raft::make_host_matrix(batch_size, k0); - auto neighbors_host = raft::make_host_matrix(batch_size, k); - auto distances_host = raft::make_host_matrix(batch_size, k); - - auto stream = resource::get_cuda_stream(handle_); - raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream); - raft::copy( - candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream); - - auto dataset_v = raft::make_host_matrix_view( - dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1)); - - raft::resource::sync_stream(handle_); // wait for the queries and candidates - raft::neighbors::refine(handle_, - dataset_v, - queries_host.view(), - candidates_host.view(), - neighbors_host.view(), - distances_host.view(), - index_->metric()); - - raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), neighbors_host.size(), stream); - raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream); - } + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + const raft::resources& res = handle_; + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); } else { auto queries_v = - raft::make_device_matrix_view(queries, batch_size, index_->dim()); - auto neighbors_v = - raft::make_device_matrix_view((IdxT*)neighbors, batch_size, k); - auto distances_v = raft::make_device_matrix_view(distances, batch_size, k); - - raft::neighbors::ivf_pq::search( - handle_, search_params_, *index_, queries_v, neighbors_v, distances_v); + raft::make_device_matrix_view(queries, batch_size, dimension_); + auto candidate_ixs = + raft::make_device_matrix(res, batch_size, k0); + auto candidate_dists = + raft::make_device_matrix(res, batch_size, k0); + search_base( + queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle()); + refine_helper( + res, dataset_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric()); } } } // namespace raft::bench::ann diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h 
index 586b81ae06..2c996058b2 100644 --- a/cpp/bench/ann/src/raft/raft_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_wrapper.h @@ -56,10 +56,11 @@ class RaftGpu : public ANN, public AnnGPU { void set_search_param(const AnnSearchParam& param) override; - // TODO: if the number of results is less than k, the remaining elements of 'neighbors' - // will be filled with (size_t)-1 - void search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final; + void search(const T* queries, + int batch_size, + int k, + AnnBase::index_type* neighbors, + float* distances) const final; // to enable dataset access from GPU memory AlgoProperty get_preference() const override @@ -133,15 +134,16 @@ void RaftGpu::load(const std::string& file) template void RaftGpu::search( - const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const + const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const { auto queries_view = raft::make_device_matrix_view(queries, batch_size, this->dim_); - auto neighbors_view = raft::make_device_matrix_view(neighbors, batch_size, k); + auto neighbors_view = + raft::make_device_matrix_view(neighbors, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - raft::neighbors::brute_force::search( + raft::neighbors::brute_force::search( handle_, *index_, queries_view, neighbors_view, distances_view); } diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 9f23c44a5c..0771a60e58 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -75,31 +75,31 @@ endfunction() if(BUILD_PRIMS_BENCH) ConfigureBench( - NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp ) ConfigureBench( - NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu - bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + NAME CLUSTER_BENCH PATH cluster/kmeans_balanced.cu cluster/kmeans.cu + main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( - NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu - bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp + NAME TUNE_DISTANCE PATH distance/tune_pairwise/kernel.cu + distance/tune_pairwise/bench.cu main.cpp ) ConfigureBench( NAME DISTANCE_BENCH PATH - bench/prims/distance/distance_cosine.cu - bench/prims/distance/distance_exp_l2.cu - bench/prims/distance/distance_l1.cu - bench/prims/distance/distance_unexp_l2.cu - bench/prims/distance/fused_l2_nn.cu - bench/prims/distance/masked_nn.cu - bench/prims/distance/kernels.cu - bench/prims/main.cpp + distance/distance_cosine.cu + distance/distance_exp_l2.cu + distance/distance_l1.cu + distance/distance_unexp_l2.cu + distance/fused_l2_nn.cu + distance/masked_nn.cu + distance/kernels.cu + main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY @@ -109,63 +109,64 @@ if(BUILD_PRIMS_BENCH) NAME LINALG_BENCH PATH - bench/prims/linalg/add.cu - bench/prims/linalg/map_then_reduce.cu - bench/prims/linalg/matrix_vector_op.cu - bench/prims/linalg/norm.cu - bench/prims/linalg/normalize.cu - bench/prims/linalg/reduce_cols_by_key.cu - bench/prims/linalg/reduce_rows_by_key.cu - bench/prims/linalg/reduce.cu - bench/prims/linalg/sddmm.cu - bench/prims/main.cpp + linalg/add.cu + linalg/map_then_reduce.cu + linalg/matrix_vector_op.cu + linalg/norm.cu + linalg/normalize.cu + 
linalg/reduce_cols_by_key.cu + linalg/reduce_rows_by_key.cu + linalg/reduce.cu + linalg/sddmm.cu + main.cpp ) ConfigureBench( - NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu - bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu + matrix/select_k.cu main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( - NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu - bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp + NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu + random/rng.cu random/subsample.cu main.cpp ) ConfigureBench( NAME SPARSE_BENCH PATH - bench/prims/sparse/bitmap_to_csr.cu - bench/prims/sparse/convert_csr.cu - bench/prims/main.cpp + sparse/bitmap_to_csr.cu + sparse/convert_csr.cu + sparse/select_k_csr.cu + main.cpp ) ConfigureBench( NAME NEIGHBORS_BENCH PATH - bench/prims/neighbors/knn/brute_force_float_int64_t.cu - bench/prims/neighbors/knn/brute_force_float_uint32_t.cu - bench/prims/neighbors/knn/cagra_float_uint32_t.cu - bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu - bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu - bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu - bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu - bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu - bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu - bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu - bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu - src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu - src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu - bench/prims/neighbors/refine_float_int64_t.cu - bench/prims/neighbors/refine_uint8_t_int64_t.cu - bench/prims/main.cpp + neighbors/knn/brute_force_float_int64_t.cu + neighbors/knn/brute_force_float_uint32_t.cu + neighbors/knn/cagra_float_uint32_t.cu + neighbors/knn/ivf_flat_filter_float_int64_t.cu + neighbors/knn/ivf_flat_float_int64_t.cu + neighbors/knn/ivf_flat_int8_t_int64_t.cu + neighbors/knn/ivf_flat_uint8_t_int64_t.cu + neighbors/knn/ivf_pq_float_int64_t.cu + neighbors/knn/ivf_pq_filter_float_int64_t.cu + neighbors/knn/ivf_pq_int8_t_int64_t.cu + neighbors/knn/ivf_pq_uint8_t_int64_t.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu + ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu + 
neighbors/refine_float_int64_t.cu + neighbors/refine_uint8_t_int64_t.cu + main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp index 4ecad6df3d..3ce43cc1e7 100644 --- a/cpp/bench/prims/common/benchmark.hpp +++ b/cpp/bench/prims/common/benchmark.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu index 078f9e6198..876e47525c 100644 --- a/cpp/bench/prims/matrix/gather.cu +++ b/cpp/bench/prims/matrix/gather.cu @@ -24,6 +24,7 @@ #include #include +#include #include namespace raft::bench::matrix { diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index aea7168142..6499078623 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -27,10 +27,12 @@ #include #include +#include #include #include #include #include +#include #include @@ -101,7 +103,7 @@ struct device_resource { if (managed_) { delete res_; } } - [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; } + [[nodiscard]] auto get() const -> rmm::device_async_resource_ref { return res_; } private: const bool managed_; @@ -158,8 +160,15 @@ struct ivf_flat_knn { IdxT* out_idxs) { search_params.n_probes = 20; - raft::neighbors::ivf_flat::search( - handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists); + raft::neighbors::ivf_flat::search(handle, + search_params, + *index, + search_items, + ps.n_queries, + ps.k, + out_idxs, + out_dists, + resource::get_workspace_resource(handle)); } }; diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu index 4c8ca2bf31..70a9c65e0d 100644 --- a/cpp/bench/prims/random/subsample.cu +++ b/cpp/bench/prims/random/subsample.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include diff --git a/cpp/bench/prims/sparse/select_k_csr.cu b/cpp/bench/prims/sparse/select_k_csr.cu new file mode 100644 index 0000000000..a91e6c8514 --- /dev/null +++ b/cpp/bench/prims/sparse/select_k_csr.cu @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace raft::bench::sparse { + +template +struct bench_param { + index_t n_rows; + index_t n_cols; + index_t top_k; + float sparsity; + bool select_min = true; + bool customized_indices = false; +}; + +template +inline auto operator<<(std::ostream& os, const bench_param& params) -> std::ostream& +{ + os << params.n_rows << "#" << params.n_cols << "#" << params.top_k << "#" << params.sparsity; + return os; +} + +template +struct SelectKCsrTest : public fixture { + SelectKCsrTest(const bench_param& p) + : fixture(true), + params(p), + handle(stream), + values_d(0, stream), + indptr_d(0, stream), + indices_d(0, stream), + customized_indices_d(0, stream), + dst_values_d(0, stream), + dst_indices_d(0, stream) + { + std::vector dense_values_h(params.n_rows * params.n_cols); + nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h); + + std::vector indices_h(nnz); + std::vector customized_indices_h(nnz); + std::vector indptr_h(params.n_rows + 1); + + convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h); + + std::vector dst_values_h(params.n_rows * params.top_k, static_cast(2.0f)); + std::vector dst_indices_h(params.n_rows * params.top_k, + static_cast(params.n_rows * params.n_cols * 100)); + + dst_values_d.resize(params.n_rows * params.top_k, stream); + dst_indices_d.resize(params.n_rows * params.top_k, stream); + values_d.resize(nnz, stream); + + if (nnz) { + auto blobs_values = raft::make_device_matrix(handle, 1, nnz); + auto labels = raft::make_device_vector(handle, 1); + + raft::random::make_blobs(blobs_values.data_handle(), + labels.data_handle(), + 1, + nnz, + 1, + stream, + false, + nullptr, + nullptr, + value_t(1.0), + false, + value_t(-10.0f), + value_t(10.0f), + uint64_t(2024)); + raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream); + resource::sync_stream(handle); + } + + indices_d.resize(nnz, stream); + indptr_d.resize(params.n_rows + 1, stream); + + update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream); + update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream); + + if (params.customized_indices) { + customized_indices_d.resize(nnz, stream); + update_device(customized_indices_d.data(), + customized_indices_h.data(), + customized_indices_h.size(), + stream); + } + } + + index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector& matrix) + { + index_t total_elements = static_cast(m * n); + index_t num_ones = static_cast((total_elements * 1.0f) * sparsity); + index_t res = num_ones; + + for (index_t i = 0; i < total_elements; ++i) { + matrix[i] = false; + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis_idx(0, total_elements - 1); + + while (num_ones > 0) { + size_t index = dis_idx(gen); + if (matrix[index] == false) { + matrix[index] = true; + num_ones--; + } + } + return res; + } + + void convert_to_csr(std::vector& matrix, + index_t rows, + index_t cols, + std::vector& indices, + std::vector& indptr) + { + index_t offset_indptr = 0; + index_t offset_values = 0; + indptr[offset_indptr++] = 0; + + for (index_t i = 0; i < rows; ++i) { + for (index_t j = 0; j < cols; ++j) { + if (matrix[i * cols + j]) { + indices[offset_values] = static_cast(j); + offset_values++; + } + } + indptr[offset_indptr++] = 
static_cast(offset_values); + } + } + + template + std::optional get_opt_var(data_t x) + { + if (params.customized_indices) { + return x; + } else { + return std::nullopt; + } + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + auto in_val_structure = raft::make_device_compressed_structure_view( + indptr_d.data(), + indices_d.data(), + params.n_rows, + params.n_cols, + static_cast(indices_d.size())); + + auto in_val = + raft::make_device_csr_matrix_view(values_d.data(), in_val_structure); + + std::optional> in_idx; + + in_idx = get_opt_var( + raft::make_device_vector_view(customized_indices_d.data(), nnz)); + + auto out_val = raft::make_device_matrix_view( + dst_values_d.data(), params.n_rows, params.top_k); + auto out_idx = raft::make_device_matrix_view( + dst_indices_d.data(), params.n_rows, params.top_k); + + raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min); + resource::sync_stream(handle); + loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() { + raft::sparse::matrix::select_k( + handle, in_val, in_idx, out_val, out_idx, params.select_min, false); + resource::sync_stream(handle); + }); + } + + protected: + const raft::device_resources handle; + + bench_param params; + index_t nnz; + + rmm::device_uvector values_d; + rmm::device_uvector indptr_d; + rmm::device_uvector indices_d; + rmm::device_uvector customized_indices_d; + + rmm::device_uvector dst_values_d; + rmm::device_uvector dst_indices_d; +}; // struct SelectKCsrTest + +template +const std::vector> getInputs() +{ + std::vector> param_vec; + struct TestParams { + index_t m; + index_t n; + index_t k; + }; + + const std::vector params_group{ + {20000, 500, 1}, {20000, 500, 2}, {20000, 500, 4}, {20000, 500, 8}, + {20000, 500, 16}, {20000, 500, 32}, {20000, 500, 64}, {20000, 500, 128}, + {20000, 500, 256}, + + {1000, 10000, 1}, {1000, 10000, 2}, {1000, 10000, 4}, {1000, 10000, 8}, + {1000, 10000, 16}, {1000, 10000, 32}, {1000, 10000, 64}, {1000, 10000, 128}, + {1000, 10000, 256}, + + {100, 100000, 1}, {100, 100000, 2}, {100, 100000, 4}, {100, 100000, 8}, + {100, 100000, 16}, {100, 100000, 32}, {100, 100000, 64}, {100, 100000, 128}, + {100, 100000, 256}, + + {10, 1000000, 1}, {10, 1000000, 2}, {10, 1000000, 4}, {10, 1000000, 8}, + {10, 1000000, 16}, {10, 1000000, 32}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 2}, {10, 1000000, 4}, {10, 1000000, 8}, + {10, 1000000, 16}, {10, 1000000, 32}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, {1000, 10000, 1}, {1000, 10000, 16}, {1000, 10000, 64}, + {1000, 10000, 128}, {1000, 10000, 256}, + + {10, 1000000, 1}, {10, 1000000, 16}, {10, 1000000, 64}, {10, 1000000, 128}, + {10, 1000000, 256}, {1000, 10000, 1}, {1000, 10000, 16}, {1000, 10000, 64}, + {1000, 10000, 128}, {1000, 10000, 256}}; + + param_vec.reserve(params_group.size()); + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.k, 0.1})); + } + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, params.n, params.k, 0.2})); + } + for (TestParams params : params_group) { + param_vec.push_back(bench_param({params.m, 
params.n, params.k, 0.5})); + } + return param_vec; +} + +RAFT_BENCH_REGISTER((SelectKCsrTest), "", getInputs()); + +} // namespace raft::bench::sparse diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index ea8a077b0c..b364d8418d 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -13,8 +13,8 @@ # ============================================================================= if(DISABLE_DEPRECATION_WARNINGS) - list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations) - list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) + list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS) + list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS) endif() # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with diff --git a/cpp/cmake/patches/faiss_override.json b/cpp/cmake/patches/faiss_override.json new file mode 100644 index 0000000000..19dad362b9 --- /dev/null +++ b/cpp/cmake/patches/faiss_override.json @@ -0,0 +1,9 @@ +{ + "packages" : { + "faiss" : { + "version": "1.7.4", + "git_url": "https://github.com/facebookresearch/faiss.git", + "git_tag": "main" + } + } +} diff --git a/cpp/cmake/patches/ggnn_override.json b/cpp/cmake/patches/ggnn_override.json new file mode 100644 index 0000000000..768fae8b0c --- /dev/null +++ b/cpp/cmake/patches/ggnn_override.json @@ -0,0 +1,16 @@ +{ + "packages" : { + "ggnn" : { + "version": "0.5", + "git_url": "https://github.com/cgtuebingen/ggnn.git", + "git_tag": "release_${version}", + "patches" : [ + { + "file" : "${current_json_dir}/ggnn.diff", + "issue" : "Correct compilation issues", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json new file mode 100644 index 0000000000..d6ab8a18a5 --- /dev/null +++ b/cpp/cmake/patches/hnswlib_override.json @@ -0,0 +1,16 @@ +{ + "packages" : { + "hnswlib" : { + "version": "0.6.2", + "git_url": "https://github.com/nmslib/hnswlib.git", + "git_tag": "v${version}", + "patches" : [ + { + "file" : "${current_json_dir}/hnswlib.diff", + "issue" : "Correct compilation issues", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 85829554ae..288da763bf 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,96 +15,104 @@ #============================================================================= function(find_and_configure_faiss) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + rapids_find_generate_module(faiss + HEADER_NAMES faiss/IndexFlat.h + LIBRARY_NAMES faiss + ) + + set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches") + rapids_cpm_package_override("${patch_dir}/faiss_override.json") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(faiss version repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(faiss ${version} patch_command) + + set(BUILD_SHARED_LIBS ON) + if (PKG_BUILD_STATIC_LIBS) + set(BUILD_SHARED_LIBS OFF) + set(CPM_DOWNLOAD_faiss ON) + endif() + + include(cmake/modules/FindAVX) + # Link against AVX CPU lib if it exists + set(RAFT_FAISS_OPT_LEVEL "generic") + if(CXX_AVX2_FOUND) + set(RAFT_FAISS_OPT_LEVEL "avx2") + endif() + + rapids_cpm_find(faiss ${version} + GLOBAL_TARGETS faiss faiss_avx2 faiss_gpu faiss::faiss faiss::faiss_avx2 + CPM_ARGS + GIT_REPOSITORY ${repository} + GIT_TAG ${tag} + GIT_SHALLOW ${shallow} ${patch_command} + EXCLUDE_FROM_ALL ${exclude} + OPTIONS + "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" + "FAISS_ENABLE_PYTHON OFF" + "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" + "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + "BUILD_TESTING OFF" + "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" + ) + + include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake") + rapids_cpm_display_patch_status(faiss) + + if(TARGET faiss AND NOT TARGET faiss::faiss) + add_library(faiss::faiss ALIAS faiss) + # We need to ensure that faiss has all the conda information. So we use this approach so that + # faiss will have the conda includes/link dirs + target_link_libraries(faiss PRIVATE $) + endif() + if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2) + add_library(faiss::faiss_avx2 ALIAS faiss_avx2) + # We need to ensure that faiss has all the conda information. So we use this approach so that + # faiss will have the conda includes/link dirs + target_link_libraries(faiss_avx2 PRIVATE $) + endif() + if(TARGET faiss_gpu AND NOT TARGET faiss::faiss_gpu) + add_library(faiss::faiss_gpu ALIAS faiss_gpu) + # We need to ensure that faiss has all the conda information. So we use this approach so that + # faiss will have the conda includes/link dirs + target_link_libraries(faiss_gpu PRIVATE $) + endif() + + if(faiss_ADDED) + rapids_export(BUILD faiss + EXPORT_SET faiss-targets + GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS} + NAMESPACE faiss::) + endif() + + # Need to tell CMake to rescan the link group of faiss::faiss_gpu and faiss + # so that we get proper link order when they are static + # + # We don't look at the existence of `faiss_avx2` as it will always exist + # even when CXX_AVX2_FOUND is false. In addition for arm builds the + # faiss_avx2 is marked as `EXCLUDE_FROM_ALL` so we don't want to add + # a dependency to it. Adding a dependency will cause it to compile, + # and fail due to invalid compiler flags.
+ if(PKG_ENABLE_GPU AND PKG_BUILD_STATIC_LIBS AND CXX_AVX2_FOUND) + set(RAFT_FAISS_TARGETS "$,faiss::faiss_avx2>" PARENT_SCOPE) + elseif(PKG_ENABLE_GPU AND PKG_BUILD_STATIC_LIBS) + set(RAFT_FAISS_TARGETS "$,faiss::faiss>" PARENT_SCOPE) + elseif(CXX_AVX2_FOUND) + set(RAFT_FAISS_TARGETS faiss::faiss_avx2 PARENT_SCOPE) + else() + set(RAFT_FAISS_TARGETS faiss::faiss PARENT_SCOPE) + endif() - rapids_find_generate_module(faiss - HEADER_NAMES faiss/IndexFlat.h - LIBRARY_NAMES faiss - ) - - set(BUILD_SHARED_LIBS ON) - if (PKG_BUILD_STATIC_LIBS) - set(BUILD_SHARED_LIBS OFF) - set(CPM_DOWNLOAD_faiss ON) - endif() - - include(cmake/modules/FindAVX.cmake) - - # Link against AVX CPU lib if it exists - set(RAFT_FAISS_GLOBAL_TARGETS faiss::faiss) - set(RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss) - set(RAFT_FAISS_OPT_LEVEL "generic") - if(CXX_AVX_FOUND) - set(RAFT_FAISS_OPT_LEVEL "avx2") - list(APPEND RAFT_FAISS_GLOBAL_TARGETS faiss::faiss_avx2) - list(APPEND RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2) - endif() - - rapids_cpm_find(faiss ${PKG_VERSION} - GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} - CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL} - OPTIONS - "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}" - "FAISS_ENABLE_PYTHON OFF" - "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}" - "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" - "BUILD_TESTING OFF" - "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" - ) - - if(TARGET faiss AND NOT TARGET faiss::faiss) - add_library(faiss::faiss ALIAS faiss) - endif() - - if(CXX_AVX_FOUND) - - if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2) - add_library(faiss::faiss_avx2 ALIAS faiss_avx2) - endif() - endif() - - - if(faiss_ADDED) - rapids_export(BUILD faiss - EXPORT_SET faiss-targets - GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS} - NAMESPACE faiss::) - endif() - - # We generate the faiss-config files when we built faiss locally, so always do `find_dependency` - rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it - rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) - rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}) - - # Tell cmake where it can find the generated faiss-config.cmake we wrote. 
- include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET raft-ann-bench-exports) endfunction() -if(NOT RAFT_FAISS_GIT_TAG) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk) - # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30) -endif() - -if(NOT RAFT_FAISS_GIT_REPOSITORY) - # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC - # (https://github.com/facebookresearch/faiss/pull/2446) - set(RAFT_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git) - # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git) -endif() - -find_and_configure_faiss(VERSION 1.7.4 - REPOSITORY ${RAFT_FAISS_GIT_REPOSITORY} - PINNED_TAG ${RAFT_FAISS_GIT_TAG} - BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} - EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL} - ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU}) +find_and_configure_faiss( + BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC} + ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU} +) diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake index 8137ef84eb..d8af4971a7 100644 --- a/cpp/cmake/thirdparty/get_ggnn.cmake +++ b/cpp/cmake/thirdparty/get_ggnn.cmake @@ -15,29 +15,31 @@ #============================================================================= function(find_and_configure_ggnn) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches") + rapids_cpm_package_override("${patch_dir}/ggnn_override.json") - set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.diff") - set(patch_issues_to_ref "fix compile issues") - set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/patch.cmake") - set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/log") - string(TIMESTAMP current_year "%Y" UTC) - configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" - @ONLY) + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(ggnn version repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(ggnn ${version} patch_command) rapids_cpm_find( - ggnn ${PKG_VERSION} + ggnn ${version} GLOBAL_TARGETS ggnn::ggnn CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - GIT_SHALLOW TRUE + GIT_REPOSITORY ${repository} + GIT_TAG ${tag} + GIT_SHALLOW ${shallow} ${patch_command} + EXCLUDE_FROM_ALL ${exclude} DOWNLOAD_ONLY ON - PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} ) + + include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake") + rapids_cpm_display_patch_status(ggnn) + if(NOT TARGET ggnn::ggnn) add_library(ggnn INTERFACE) target_include_directories(ggnn INTERFACE "$") @@ -45,14 +47,4 @@ function(find_and_configure_ggnn) endif() endfunction() -if(NOT RAFT_GGNN_GIT_TAG) - set(RAFT_GGNN_GIT_TAG release_0.5) -endif() - -if(NOT RAFT_GGNN_GIT_REPOSITORY) - set(RAFT_GGNN_GIT_REPOSITORY https://github.com/cgtuebingen/ggnn.git) -endif() -find_and_configure_ggnn(VERSION 0.5 - REPOSITORY ${RAFT_GGNN_GIT_REPOSITORY} - PINNED_TAG ${RAFT_GGNN_GIT_TAG} - ) +find_and_configure_ggnn() diff --git 
a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 4d28e9a064..6ef493336f 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -15,78 +15,74 @@ #============================================================================= function(find_and_configure_hnswlib) - set(oneValueArgs VERSION REPOSITORY PINNED_TAG EXCLUDE_FROM_ALL) - cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN} ) + set(oneValueArgs) - set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.diff") - set(patch_issues_to_ref "fix compile issues") - set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/patch.cmake") - set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/log") - string(TIMESTAMP current_year "%Y" UTC) - configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" - @ONLY) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches") + rapids_cpm_package_override("${patch_dir}/hnswlib_override.json") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(hnswlib version repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(hnswlib ${version} patch_command) rapids_cpm_find( - hnswlib ${PKG_VERSION} - GLOBAL_TARGETS hnswlib::hnswlib - BUILD_EXPORT_SET raft-exports - INSTALL_EXPORT_SET raft-exports + hnswlib ${version} + GLOBAL_TARGETS hnswlib hnswlib::hnswlib CPM_ARGS - GIT_REPOSITORY ${PKG_REPOSITORY} - GIT_TAG ${PKG_PINNED_TAG} - GIT_SHALLOW TRUE + GIT_REPOSITORY ${repository} + GIT_TAG ${tag} + GIT_SHALLOW ${shallow} ${patch_command} + EXCLUDE_FROM_ALL ${exclude} DOWNLOAD_ONLY ON - PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} ) + + include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake") + rapids_cpm_display_patch_status(hnswlib) + if(NOT TARGET hnswlib::hnswlib) add_library(hnswlib INTERFACE ) add_library(hnswlib::hnswlib ALIAS hnswlib) target_include_directories(hnswlib INTERFACE "$" "$") + endif() - if(NOT PKG_EXCLUDE_FROM_ALL) - install(TARGETS hnswlib EXPORT hnswlib-exports) + if(hnswlib_ADDED) + # write build export rules + install(TARGETS hnswlib EXPORT hnswlib-exports) + if(NOT exclude) install(DIRECTORY "${hnswlib_SOURCE_DIR}/hnswlib/" DESTINATION include/hnswlib) # write install export rules rapids_export( INSTALL hnswlib - VERSION ${PKG_VERSION} + VERSION ${version} EXPORT_SET hnswlib-exports GLOBAL_TARGETS hnswlib NAMESPACE hnswlib::) endif() - # write build export rules rapids_export( BUILD hnswlib - VERSION ${PKG_VERSION} + VERSION ${version} EXPORT_SET hnswlib-exports GLOBAL_TARGETS hnswlib NAMESPACE hnswlib::) - include("${rapids-cmake-dir}/export/find_package_root.cmake") + include("${rapids-cmake-dir}/export/package.cmake") + rapids_export_package(INSTALL hnswlib raft-exports VERSION ${version} GLOBAL_TARGETS hnswlib hnswlib::hnswlib) + rapids_export_package(BUILD hnswlib raft-exports VERSION ${version} GLOBAL_TARGETS hnswlib hnswlib::hnswlib) + # When using RAFT from the build dir, ensure hnswlib is also found in RAFT's build dir. 
This # line adds `set(hnswlib_ROOT "${CMAKE_CURRENT_LIST_DIR}")` to build/raft-dependencies.cmake + include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( BUILD hnswlib [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET raft-exports ) endif() endfunction() - -if(NOT RAFT_HNSWLIB_GIT_TAG) - set(RAFT_HNSWLIB_GIT_TAG v0.6.2) -endif() - -if(NOT RAFT_HNSWLIB_GIT_REPOSITORY) - set(RAFT_HNSWLIB_GIT_REPOSITORY https://github.com/nmslib/hnswlib.git) -endif() -find_and_configure_hnswlib(VERSION 0.6.2 - REPOSITORY ${RAFT_HNSWLIB_GIT_REPOSITORY} - PINNED_TAG ${RAFT_HNSWLIB_GIT_TAG} - EXCLUDE_FROM_ALL OFF - ) +find_and_configure_hnswlib() diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh index 6d3f430e88..0a5a3ba5aa 100644 --- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh @@ -43,15 +43,14 @@ #include #include -#include -#include #include -#include +#include #include #include #include +#include #include #include @@ -91,7 +90,7 @@ inline std::enable_if_t> predict_core( const MathT* dataset_norm, IdxT n_rows, LabelT* labels, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto stream = resource::get_cuda_stream(handle); switch (params.metric) { @@ -263,10 +262,9 @@ void calc_centers_and_sizes(const raft::resources& handle, const LabelT* labels, bool reset_counters, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) + rmm::device_async_resource_ref mr) { auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } if (!reset_counters) { raft::linalg::matrixVectorOp( @@ -322,12 +320,12 @@ void compute_norm(const raft::resources& handle, IdxT dim, IdxT n_rows, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) + std::optional mr = std::nullopt) { common::nvtx::range fun_scope("compute_norm"); auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } - rmm::device_uvector mapped_dataset(0, stream, mr); + rmm::device_uvector mapped_dataset( + 0, stream, mr.value_or(resource::get_workspace_resource(handle))); const MathT* dataset_ptr = nullptr; @@ -338,7 +336,7 @@ void compute_norm(const raft::resources& handle, linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream); - dataset_ptr = (const MathT*)mapped_dataset.data(); + dataset_ptr = static_cast(mapped_dataset.data()); } raft::linalg::rowNorm( @@ -376,22 +374,22 @@ void predict(const raft::resources& handle, IdxT n_rows, LabelT* labels, MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr, - const MathT* dataset_norm = nullptr) + std::optional mr = std::nullopt, + const MathT* dataset_norm = nullptr) { auto stream = resource::get_cuda_stream(handle); common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } + auto mem_res = mr.value_or(resource::get_workspace_resource(handle)); auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); rmm::device_uvector cur_dataset( - std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); + std::is_same_v ? 
0 : max_minibatch_size * dim, stream, mem_res); bool need_compute_norm = dataset_norm == nullptr && (params.metric == raft::distance::DistanceType::L2Expanded || params.metric == raft::distance::DistanceType::L2SqrtExpanded); rmm::device_uvector cur_dataset_norm( - need_compute_norm ? max_minibatch_size : 0, stream, mr); + need_compute_norm ? max_minibatch_size : 0, stream, mem_res); const MathT* dataset_norm_ptr = nullptr; auto cur_dataset_ptr = cur_dataset.data(); for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { @@ -407,7 +405,7 @@ void predict(const raft::resources& handle, // Compute the norm now if it hasn't been pre-computed. if (need_compute_norm) { compute_norm( - handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr); + handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mem_res); dataset_norm_ptr = cur_dataset_norm.data(); } else if (dataset_norm != nullptr) { dataset_norm_ptr = dataset_norm + offset; @@ -422,7 +420,7 @@ void predict(const raft::resources& handle, dataset_norm_ptr, minibatch_size, labels + offset, - mr); + mem_res); } } @@ -530,7 +528,7 @@ auto adjust_centers(MathT* centers, MathT threshold, MappingOpT mapping_op, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* device_memory) -> bool + rmm::device_async_resource_ref device_memory) -> bool { common::nvtx::range fun_scope( "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); @@ -628,7 +626,7 @@ void balancing_em_iters(const raft::resources& handle, uint32_t balancing_pullback, MathT balancing_threshold, MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory) + rmm::device_async_resource_ref device_memory) { auto stream = resource::get_cuda_stream(handle); uint32_t balancing_counter = balancing_pullback; @@ -711,7 +709,7 @@ void build_clusters(const raft::resources& handle, LabelT* cluster_labels, CounterT* cluster_sizes, MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory, + rmm::device_async_resource_ref device_memory, const MathT* dataset_norm = nullptr) { auto stream = resource::get_cuda_stream(handle); @@ -853,8 +851,8 @@ auto build_fine_clusters(const raft::resources& handle, IdxT fine_clusters_nums_max, MathT* cluster_centers, MappingOpT mapping_op, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) -> IdxT + rmm::device_async_resource_ref managed_memory, + rmm::device_async_resource_ref device_memory) -> IdxT { auto stream = resource::get_cuda_stream(handle); rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); @@ -971,7 +969,7 @@ void build_hierarchical(const raft::resources& handle, // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf. 
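The hunks above migrate these kmeans_balanced helpers from raw rmm::mr::device_memory_resource* arguments to rmm::device_async_resource_ref, using std::optional plus value_or() to fall back to the handle's workspace resource when the caller passes nothing. A minimal sketch of that pattern, assuming only the public raft::resource and rmm headers; the alloc_scratch helper is illustrative and not part of RAFT:

#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
#include <optional>

// Callers may pass an explicit memory resource; otherwise the handle's
// workspace resource is used, mirroring predict()/compute_norm() above.
inline rmm::device_uvector<float> alloc_scratch(
  const raft::resources& handle,
  std::size_t n,
  std::optional<rmm::device_async_resource_ref> mr = std::nullopt)
{
  auto stream  = raft::resource::get_cuda_stream(handle);
  auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
  return rmm::device_uvector<float>(n, stream, mem_res);
}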
rmm::mr::managed_memory_resource managed_memory; - rmm::mr::device_memory_resource* device_memory = resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle); auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh index 8cd7730814..a1a182608b 100644 --- a/cpp/include/raft/cluster/kmeans_balanced.cuh +++ b/cpp/include/raft/cluster/kmeans_balanced.cuh @@ -358,7 +358,8 @@ void calc_centers_and_sizes(const raft::resources& handle, X.extent(0), labels.data_handle(), reset_counters, - mapping_op); + mapping_op, + resource::get_workspace_resource(handle)); } } // namespace helpers diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh index 9588a7f329..e85b05575f 100644 --- a/cpp/include/raft/cluster/specializations.cuh +++ b/cpp/include/raft/cluster/specializations.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message( \ __FILE__ \ " is deprecated and will be removed." \ " Including specializations is not necessary any more." \ " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") +#endif diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index dd8fc2d103..239d6e08f6 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,11 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please note that there is no equivalent in RAFT's public API" " so this file will eventually be removed altogether.") +#endif #include diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index 6c62cd70cc..53724f4ae1 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,10 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use the raft/util version instead.") +#endif #include diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 72de79a596..dcbd46b236 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,10 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use the raft/matrix version instead.") +#endif #include diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp index 433b032b0f..56b41a41f4 100644 --- a/cpp/include/raft/common/seive.hpp +++ b/cpp/include/raft/common/seive.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,10 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use the raft/util version instead.") +#endif #include diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index 6e7ff7106f..cb1accc95e 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,17 @@ namespace raft { namespace comms { namespace detail { +using ucp_endpoint_array_t = std::shared_ptr; +using ucxx_endpoint_array_t = std::shared_ptr; +using ucp_worker_t = ucp_worker_h; +using ucxx_worker_t = ucxx::Worker*; + +struct ucx_objects_t { + public: + std::variant endpoints; + std::variant worker; +}; + class std_comms : public comms_iface { public: std_comms() = delete; @@ -64,8 +76,7 @@ class std_comms : public comms_iface { * @param subcomms_ucp use ucp for subcommunicators */ std_comms(ncclComm_t nccl_comm, - ucp_worker_h ucp_worker, - std::shared_ptr eps, + ucx_objects_t ucx_objects, int num_ranks, int rank, rmm::cuda_stream_view stream, @@ -76,9 +87,8 @@ class std_comms : public comms_iface { num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), + ucx_objects_(ucx_objects), own_nccl_comm_(false), - ucp_worker_(ucp_worker), - ucp_eps_(eps), next_request_id_(0) { initialize(); @@ -205,96 +215,209 @@ class std_comms : public comms_iface { void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + if (std::holds_alternative(ucx_objects_.worker)) { + get_request_id(request); - get_request_id(request); - ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; + ucxx::Endpoint* ep_ptr = (*std::get(ucx_objects_.endpoints))[dest]; - ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_tag_t ucp_tag = build_message_tag(get_rank(), tag); + auto ucxx_req = ep_ptr->tagSend(const_cast(buf), size, ucxx::Tag(ucp_tag)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); + requests_in_flight_.insert(std::make_pair(*request, ucxx_req)); + } else { + ASSERT(std::get(ucx_objects_.worker) != nullptr, + "ERROR: UCX comms not initialized on communicator."); - requests_in_flight_.insert(std::make_pair(*request, ucp_req)); - } + get_request_id(request); + ucp_ep_h ep_ptr = (*std::get(ucx_objects_.endpoints))[dest]; - void irecv(void* buf, size_t size, int source, int tag, request_t* request) const - { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + 
ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - get_request_id(request); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); - ucp_ep_h ep_ptr = (*ucp_eps_)[source]; - - ucp_tag_t tag_mask = default_tag_mask; - - ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); - - requests_in_flight_.insert(std::make_pair(*request, ucp_req)); + requests_in_flight_.insert(std::make_pair(*request, ucp_req)); + } } - void waitall(int count, request_t array_of_requests[]) const + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + if (std::holds_alternative(ucx_objects_.worker)) { + get_request_id(request); - std::vector requests; - requests.reserve(count); + ucxx::Endpoint* ep_ptr = (*std::get(ucx_objects_.endpoints))[source]; - time_t start = time(NULL); + ucp_tag_t ucp_tag = build_message_tag(get_rank(), tag); + auto ucxx_req = + ep_ptr->tagRecv(buf, size, ucxx::Tag(ucp_tag), ucxx::TagMask(default_tag_mask)); - for (int i = 0; i < count; ++i) { - auto req_it = requests_in_flight_.find(array_of_requests[i]); - ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", - array_of_requests[i]); - requests.push_back(req_it->second); - free_requests_.insert(req_it->first); - requests_in_flight_.erase(req_it); - } - - while (requests.size() > 0) { - time_t now = time(NULL); + requests_in_flight_.insert(std::make_pair(*request, ucxx_req)); + } else { + ASSERT(std::get(ucx_objects_.worker) != nullptr, + "ERROR: UCX comms not initialized on communicator."); - // Timeout if we have not gotten progress or completed any requests - // in 10 or more seconds. - ASSERT(now - start < 10, "Timed out waiting for requests."); + get_request_id(request); - for (std::vector::iterator it = requests.begin(); it != requests.end();) { - bool restart = false; // resets the timeout when any progress was made + ucp_ep_h ep_ptr = (*std::get(ucx_objects_.endpoints))[source]; - // Causes UCP to progress through the send/recv message queue - while (ucp_worker_progress(ucp_worker_) != 0) { - restart = true; - } + ucp_tag_t tag_mask = default_tag_mask; - auto req = *it; + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, + std::get(ucx_objects_.worker), + ep_ptr, + buf, + size, + tag, + tag_mask, + source); - // If the message needs release, we know it will be sent/received - // asynchronously, so we will need to track and verify its state - if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); - ASSERT(req->req->completed == 1 || req->req->completed == 0, - "request->completed not a valid value: %d\n", - req->req->completed); - } + requests_in_flight_.insert(std::make_pair(*request, ucp_req)); + } + } - // If a message was sent synchronously (eg. completed before - // `isend`/`irecv` completed) or an asynchronous message - // is complete, we can go ahead and clean it up. 
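Every UCX call site rewritten here follows the same dispatch shape: ucx_objects_t keeps either a UCXX worker or a raw UCP worker in a std::variant, and isend/irecv/waitall branch with std::holds_alternative before std::get-ing the active alternative. A self-contained sketch of that pattern, using stand-in types rather than the real ucxx::Worker and ucp_worker_h handles:

#include <iostream>
#include <memory>
#include <variant>

// Stand-ins for ucxx::Worker* and ucp_worker_h, for illustration only.
struct ucxx_worker_stub { void progress() { std::cout << "ucxx progress\n"; } };
struct ucp_worker_stub  { void progress() { std::cout << "ucp progress\n"; } };

struct ucx_objects_stub {
  std::variant<std::shared_ptr<ucxx_worker_stub>, std::shared_ptr<ucp_worker_stub>> worker;
};

// Branch on the active alternative, as std_comms::isend/irecv/waitall do.
void progress_once(ucx_objects_stub& ucx)
{
  if (std::holds_alternative<std::shared_ptr<ucxx_worker_stub>>(ucx.worker)) {
    std::get<std::shared_ptr<ucxx_worker_stub>>(ucx.worker)->progress();
  } else {
    std::get<std::shared_ptr<ucp_worker_stub>>(ucx.worker)->progress();
  }
}

int main()
{
  ucx_objects_stub ucx{std::make_shared<ucxx_worker_stub>()};
  progress_once(ucx);  // prints "ucxx progress"
}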
- if (!req->needs_release || req->req->completed == 1) { - restart = true; + void waitall(int count, request_t array_of_requests[]) const + { + if (std::holds_alternative(ucx_objects_.worker)) { + ucxx_worker_t worker = std::get(ucx_objects_.worker); + + std::vector> requests; + requests.reserve(count); + + time_t start = time(NULL); + + for (int i = 0; i < count; ++i) { + auto req_it = requests_in_flight_.find(array_of_requests[i]); + ASSERT(requests_in_flight_.end() != req_it, + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); + requests.push_back(std::get>(req_it->second)); + free_requests_.insert(req_it->first); + requests_in_flight_.erase(req_it); + } - // perform cleanup - ucp_handler_.free_ucp_request(req); + while (requests.size() > 0) { + time_t now = time(NULL); + + // Timeout if we have not gotten progress or completed any requests + // in 10 or more seconds. + ASSERT(now - start < 10, "Timed out waiting for requests."); + + for (std::vector>::iterator it = requests.begin(); + it != requests.end();) { + bool restart = false; // resets the timeout when any progress was made + + if (worker->isProgressThreadRunning()) { + // Wait for a UCXX progress thread roundtrip + ucxx::utils::CallbackNotifier callbackNotifierPre{}; + worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }); + callbackNotifierPre.wait(); + + ucxx::utils::CallbackNotifier callbackNotifierPost{}; + worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }); + callbackNotifierPost.wait(); + } else { + // Causes UCXX to progress through the send/recv message queue + while (!worker->progress()) { + restart = true; + } + } + + auto req = *it; + + // If the message needs release, we know it will be sent/received + // asynchronously, so we will need to track and verify its state + if (req->isCompleted()) { + auto status = req->getStatus(); + ASSERT(req->getStatus() == UCS_OK, + "UCX Request Error: %d (%s)\n", + status, + ucs_status_string(status)); + } + + // If a message was sent synchronously (eg. completed before + // `isend`/`irecv` completed) or an asynchronous message + // is complete, we can go ahead and clean it up. + if (req->isCompleted()) { + restart = true; + + auto status = req->getStatus(); + ASSERT(req->getStatus() == UCS_OK, + "UCX Request Error: %d (%s)\n", + status, + ucs_status_string(status)); + + // remove from pending requests + it = requests.erase(it); + } else { + ++it; + } + // if any progress was made, reset the timeout start time + if (restart) { start = time(NULL); } + } + } + } else { + ucp_worker_t worker = std::get(ucx_objects_.worker); + ASSERT(worker != nullptr, "ERROR: UCX comms not initialized on communicator."); + + std::vector requests; + requests.reserve(count); + + time_t start = time(NULL); + + for (int i = 0; i < count; ++i) { + auto req_it = requests_in_flight_.find(array_of_requests[i]); + ASSERT(requests_in_flight_.end() != req_it, + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); + requests.push_back(std::get(req_it->second)); + free_requests_.insert(req_it->first); + requests_in_flight_.erase(req_it); + } - // remove from pending requests - it = requests.erase(it); - } else { - ++it; + while (requests.size() > 0) { + time_t now = time(NULL); + + // Timeout if we have not gotten progress or completed any requests + // in 10 or more seconds. 
+ ASSERT(now - start < 10, "Timed out waiting for requests."); + + for (std::vector::iterator it = requests.begin(); it != requests.end();) { + bool restart = false; // resets the timeout when any progress was made + + // Causes UCP to progress through the send/recv message queue + while (ucp_worker_progress(worker) != 0) { + restart = true; + } + + auto req = *it; + + // If the message needs release, we know it will be sent/received + // asynchronously, so we will need to track and verify its state + if (req->needs_release) { + ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); + ASSERT(req->req->completed == 1 || req->req->completed == 0, + "request->completed not a valid value: %d\n", + req->req->completed); + } + + // If a message was sent synchronously (eg. completed before + // `isend`/`irecv` completed) or an asynchronous message + // is complete, we can go ahead and clean it up. + if (!req->needs_release || req->req->completed == 1) { + restart = true; + + // perform cleanup + ucp_handler_.free_ucp_request(req); + + // remove from pending requests + it = requests.erase(it); + } else { + ++it; + } + // if any progress was made, reset the timeout start time + if (restart) { start = time(NULL); } } - // if any progress was made, reset the timeout start time - if (restart) { start = time(NULL); } } } } @@ -524,10 +647,11 @@ class std_comms : public comms_iface { bool own_nccl_comm_; comms_ucp_handler ucp_handler_; - ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + ucx_objects_t ucx_objects_; mutable request_t next_request_id_; - mutable std::unordered_map requests_in_flight_; + mutable std::unordered_map>> + requests_in_flight_; mutable std::unordered_set free_requests_; }; } // namespace detail diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 5896248c1d..65e1957e54 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -46,9 +46,7 @@ struct ucx_context { class ucp_request { public: struct ucx_context* req; - bool needs_release = true; - int other_rank = -1; - bool is_send_request = false; + bool needs_release = true; }; // by default, match the whole tag @@ -72,17 +70,16 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_ context->completed = 1; } +ucp_tag_t build_message_tag(int rank, int tag) +{ + // keeping the rank in the lower bits enables debugging. + return ((uint32_t)tag << 31) | (uint32_t)rank; +} + /** * Helper class for interacting with ucp. */ class comms_ucp_handler { - private: - ucp_tag_t build_message_tag(int rank, int tag) const - { - // keeping the rank in the lower bits enables debugging. 
- return ((uint32_t)tag << 31) | (uint32_t)rank; - } - public: /** * @brief Frees any memory underlying the given ucp request object @@ -132,9 +129,7 @@ class comms_ucp_handler { req->needs_release = false; } - req->other_rank = rank; - req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** @@ -156,10 +151,8 @@ class comms_ucp_handler { struct ucx_context* ucp_req = (struct ucx_context*)recv_result; - req->req = ucp_req; - req->needs_release = true; - req->is_send_request = false; - req->other_rank = sender_rank; + req->req = ucp_req; + req->needs_release = true; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index c81b19c9ba..667c8be285 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -24,6 +24,7 @@ #include #include +#include #include @@ -81,6 +82,8 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank * * @param handle raft::resources for injecting the comms * @param nccl_comm initialized NCCL communicator to use for collectives + * @param is_ucxx whether `ucp_worker` and `eps` objects are UCXX (true) or + * pure UCX (false). * @param ucp_worker of local process * Note: This is purposefully left as void* so that the ucp_worker_h * doesn't need to be exposed through the cython layer @@ -112,30 +115,55 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank * comm.sync_stream(resource::get_cuda_stream(handle)); * @endcode */ -void build_comms_nccl_ucx( - resources* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +void build_comms_nccl_ucx(resources* handle, + ncclComm_t nccl_comm, + bool is_ucxx, + void* ucp_worker, + void* eps, + int num_ranks, + int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); + detail::ucx_objects_t ucx_objects; + if (is_ucxx) { + ucx_objects.endpoints = std::make_shared(new ucxx::Endpoint*[num_ranks]); + ucx_objects.worker = static_cast(ucp_worker); + } else { + ucx_objects.endpoints = std::make_shared(new ucp_ep_h[num_ranks]); + ucx_objects.worker = static_cast(ucp_worker); + } auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); - - if (ptr != 0) { - auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + size_t ptr = size_t_ep_arr[i]; + + if (is_ucxx) { + auto ucp_ep_v = reinterpret_cast( + *std::get(ucx_objects.endpoints)); + + if (ptr != 0) { + auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); + ucp_ep_v[i] = eps_ptr; + } else { + ucp_ep_v[i] = nullptr; + } } else { - ucp_ep_v[i] = nullptr; + auto ucp_ep_v = + reinterpret_cast(*std::get(ucx_objects.endpoints)); + + if (ptr != 0) { + auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); + ucp_ep_v[i] = eps_ptr; + } else { + ucp_ep_v[i] = nullptr; + } } } cudaStream_t stream = resource::get_cuda_stream(*handle); - auto communicator = - std::make_shared(std::unique_ptr(new raft::comms::std_comms( - nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); + auto communicator = std::make_shared(std::unique_ptr( + new raft::comms::std_comms(nccl_comm, ucx_objects, num_ranks, rank, stream))); resource::set_comms(*handle, communicator); } diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh index 829c84ed25..2c23a77e47 100644 --- 
a/cpp/include/raft/core/bitmap.cuh +++ b/cpp/include/raft/core/bitmap.cuh @@ -16,112 +16,30 @@ #pragma once +#include #include #include #include #include #include -namespace raft::core { -/** - * @defgroup bitmap Bitmap - * @{ - */ -/** - * @brief View of a RAFT Bitmap. - * - * This lightweight structure which represents and manipulates a two-dimensional bitmap matrix view - * with row major order. This class provides functionality for handling a matrix where each element - * is represented as a bit in a bitmap. - * - * @tparam bitmap_t Underlying type of the bitmap array. Default is uint32_t. - * @tparam index_t Indexing type used. Default is uint32_t. - */ -template -struct bitmap_view : public bitset_view { - static_assert((std::is_same::value || - std::is_same::value), - "The bitmap_t must be uint32_t or uint64_t."); - /** - * @brief Create a bitmap view from a device raw pointer. - * - * @param bitmap_ptr Device raw pointer - * @param rows Number of row in the matrix. - * @param cols Number of col in the matrix. - */ - _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols) - : bitset_view(bitmap_ptr, rows * cols), rows_(rows), cols_(cols) - { - } - - /** - * @brief Create a bitmap view from a device vector view of the bitset. - * - * @param bitmap_span Device vector view of the bitmap - * @param rows Number of row in the matrix. - * @param cols Number of col in the matrix. - */ - _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view bitmap_span, - index_t rows, - index_t cols) - : bitset_view(bitmap_span, rows * cols), rows_(rows), cols_(cols) - { - } +#include - private: - // Hide the constructors of bitset_view. - _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t bitmap_len) - : bitset_view(bitmap_ptr, bitmap_len) - { - } - - _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view bitmap_span, - index_t bitmap_len) - : bitset_view(bitmap_span, bitmap_len) - { - } - - public: - /** - * @brief Device function to test if a given row and col are set in the bitmap. - * - * @param row Row index of the bit to test - * @param col Col index of the bit to test - * @return bool True if index has not been unset in the bitset - */ - inline _RAFT_DEVICE auto test(const index_t row, const index_t col) const -> bool - { - return test(row * cols_ + col); - } - - /** - * @brief Device function to set a given row and col to set_value in the bitset. 
- * - * @param row Row index of the bit to set - * @param col Col index of the bit to set - * @param new_value Value to set the bit to (true or false) - */ - inline _RAFT_DEVICE void set(const index_t row, const index_t col, bool new_value) const - { - set(row * cols_ + col, &new_value); - } - - /** - * @brief Get the total number of rows - * @return index_t The total number of rows - */ - inline _RAFT_HOST_DEVICE index_t get_n_rows() const { return rows_; } - - /** - * @brief Get the total number of columns - * @return index_t The total number of columns - */ - inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; } +namespace raft::core { - private: - index_t rows_; - index_t cols_; -}; +template +_RAFT_HOST_DEVICE inline bool bitmap_view::test(const index_t row, + const index_t col) const +{ + return test(row * cols_ + col); +} + +template +_RAFT_HOST_DEVICE void bitmap_view::set(const index_t row, + const index_t col, + bool new_value) const +{ + set(row * cols_ + col, &new_value); +} -/** @} */ } // end namespace raft::core diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp new file mode 100644 index 0000000000..5c77866164 --- /dev/null +++ b/cpp/include/raft/core/bitmap.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace raft::core { +/** + * @defgroup bitmap Bitmap + * @{ + */ +/** + * @brief View of a RAFT Bitmap. + * + * This lightweight structure which represents and manipulates a two-dimensional bitmap matrix view + * with row major order. This class provides functionality for handling a matrix where each element + * is represented as a bit in a bitmap. + * + * @tparam bitmap_t Underlying type of the bitmap array. Default is uint32_t. + * @tparam index_t Indexing type used. Default is uint32_t. + */ +template +struct bitmap_view : public bitset_view { + static_assert((std::is_same::type, uint32_t>::value || + std::is_same::type, uint64_t>::value), + "The bitmap_t must be uint32_t or uint64_t."); + /** + * @brief Create a bitmap view from a device raw pointer. + * + * @param bitmap_ptr Device raw pointer + * @param rows Number of row in the matrix. + * @param cols Number of col in the matrix. + */ + _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols) + : bitset_view(bitmap_ptr, rows * cols), rows_(rows), cols_(cols) + { + } + + /** + * @brief Create a bitmap view from a device vector view of the bitset. + * + * @param bitmap_span Device vector view of the bitmap + * @param rows Number of row in the matrix. + * @param cols Number of col in the matrix. + */ + _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view bitmap_span, + index_t rows, + index_t cols) + : bitset_view(bitmap_span, rows * cols), rows_(rows), cols_(cols) + { + } + + private: + // Hide the constructors of bitset_view. 
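bitmap_view is a thin row-major wrapper over bitset_view: test() and set() linearize (row, col) into row * cols_ + col and defer to the underlying bitset. A minimal usage sketch, assuming a CUDA translation unit; the mark_cell kernel and example() wrapper are illustrative and not part of this header:

#include <raft/core/bitmap.cuh>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime.h>

#include <cstdint>

// Set a single (row, col) bit of a row-major bitmap from device code.
__global__ void mark_cell(raft::core::bitmap_view<uint32_t, uint32_t> bm,
                          uint32_t row,
                          uint32_t col)
{
  if (threadIdx.x == 0 && blockIdx.x == 0) { bm.set(row, col, true); }
}

void example(const raft::resources& handle, uint32_t n_rows, uint32_t n_cols)
{
  auto stream = raft::resource::get_cuda_stream(handle);
  // One bit per matrix element, packed into 32-bit words.
  uint32_t n_words = (n_rows * n_cols + 31) / 32;
  rmm::device_uvector<uint32_t> words(n_words, stream);
  cudaMemsetAsync(words.data(), 0, n_words * sizeof(uint32_t), stream);  // all bits unset

  raft::core::bitmap_view<uint32_t, uint32_t> bm(words.data(), n_rows, n_cols);
  mark_cell<<<1, 1, 0, stream>>>(bm, 2, 5);  // bm.test(2, 5) is now true on device
}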
+ _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t bitmap_len) + : bitset_view(bitmap_ptr, bitmap_len) + { + } + + _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view bitmap_span, + index_t bitmap_len) + : bitset_view(bitmap_span, bitmap_len) + { + } + + public: + /** + * @brief Device function to test if a given row and col are set in the bitmap. + * + * @param row Row index of the bit to test + * @param col Col index of the bit to test + * @return bool True if index has not been unset in the bitset + */ + inline _RAFT_HOST_DEVICE bool test(const index_t row, const index_t col) const; + + /** + * @brief Device function to set a given row and col to set_value in the bitset. + * + * @param row Row index of the bit to set + * @param col Col index of the bit to set + * @param new_value Value to set the bit to (true or false) + */ + inline _RAFT_HOST_DEVICE void set(const index_t row, const index_t col, bool new_value) const; + + /** + * @brief Get the total number of rows + * @return index_t The total number of rows + */ + inline _RAFT_HOST_DEVICE index_t get_n_rows() const { return rows_; } + + /** + * @brief Get the total number of columns + * @return index_t The total number of columns + */ + inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; } + + private: + index_t rows_; + index_t cols_; +}; + +/** @} */ +} // end namespace raft::core diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index 53fd586ed2..d7eedee92e 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -16,7 +16,8 @@ #pragma once -#include // native_popc +#include +#include #include #include #include @@ -28,372 +29,147 @@ #include namespace raft::core { -/** - * @defgroup bitset Bitset - * @{ - */ -/** - * @brief View of a RAFT Bitset. - * - * This lightweight structure stores a pointer to a bitset in device memory with it's length. - * It provides a test() device function to check if a given index is set in the bitset. - * - * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t. - * @tparam index_t Indexing type used. Default is uint32_t. - */ -template -struct bitset_view { - static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; - - _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len) - : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len} - { - } - /** - * @brief Create a bitset view from a device vector view of the bitset. - * - * @param bitset_span Device vector view of the bitset - * @param bitset_len Number of bits in the bitset - */ - _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view bitset_span, - index_t bitset_len) - : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len} - { - } - /** - * @brief Device function to test if a given index is set in the bitset. - * - * @param sample_index Single index to test - * @return bool True if index has not been unset in the bitset - */ - inline _RAFT_DEVICE auto test(const index_t sample_index) const -> bool - { - const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size]; - const index_t bit_index = sample_index % bitset_element_size; - const bool is_bit_set = (bit_element & (bitset_t{1} << bit_index)) != 0; - return is_bit_set; - } - /** - * @brief Device function to test if a given index is set in the bitset. 
- * - * @param sample_index Single index to test - * @return bool True if index has not been unset in the bitset - */ - inline _RAFT_DEVICE auto operator[](const index_t sample_index) const -> bool - { - return test(sample_index); - } - /** - * @brief Device function to set a given index to set_value in the bitset. - * - * @param sample_index index to set - * @param set_value Value to set the bit to (true or false) - */ - inline _RAFT_DEVICE void set(const index_t sample_index, bool set_value) const - { - const index_t bit_element = sample_index / bitset_element_size; - const index_t bit_index = sample_index % bitset_element_size; - const bitset_t bitmask = bitset_t{1} << bit_index; - if (set_value) { - atomicOr(bitset_ptr_ + bit_element, bitmask); - } else { - const bitset_t bitmask2 = ~bitmask; - atomicAnd(bitset_ptr_ + bit_element, bitmask2); - } - } - - /** - * @brief Get the device pointer to the bitset. - */ - inline _RAFT_HOST_DEVICE auto data() -> bitset_t* { return bitset_ptr_; } - inline _RAFT_HOST_DEVICE auto data() const -> const bitset_t* { return bitset_ptr_; } - /** - * @brief Get the number of bits of the bitset representation. - */ - inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; } - - /** - * @brief Get the number of elements used by the bitset representation. - */ - inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t - { - return raft::ceildiv(bitset_len_, bitset_element_size); - } - - inline auto to_mdspan() -> raft::device_vector_view - { - return raft::make_device_vector_view(bitset_ptr_, n_elements()); - } - inline auto to_mdspan() const -> raft::device_vector_view - { - return raft::make_device_vector_view(bitset_ptr_, n_elements()); - } - - private: - bitset_t* bitset_ptr_; - index_t bitset_len_; -}; - -/** - * @brief RAFT Bitset. - * - * This structure encapsulates a bitset in device memory. It provides a view() method to get a - * device-usable lightweight view of the bitset. - * Each index is represented by a single bit in the bitset. The total number of bytes used is - * ceil(bitset_len / 8). - * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t. - * @tparam index_t Indexing type used. Default is uint32_t. - */ -template -struct bitset { - static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; - - /** - * @brief Construct a new bitset object with a list of indices to unset. - * - * @param res RAFT resources - * @param mask_index List of indices to unset in the bitset - * @param bitset_len Length of the bitset - * @param default_value Default value to set the bits to. Default is true. - */ - bitset(const raft::resources& res, - raft::device_vector_view mask_index, - index_t bitset_len, - bool default_value = true) - : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), - raft::resource::get_cuda_stream(res)}, - bitset_len_{bitset_len} - { - reset(res, default_value); - set(res, mask_index, !default_value); - } - /** - * @brief Construct a new bitset object - * - * @param res RAFT resources - * @param bitset_len Length of the bitset - * @param default_value Default value to set the bits to. Default is true. 
- */ - bitset(const raft::resources& res, index_t bitset_len, bool default_value = true) - : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), - resource::get_cuda_stream(res)}, - bitset_len_{bitset_len} - { - reset(res, default_value); - } - // Disable copy constructor - bitset(const bitset&) = delete; - bitset(bitset&&) = default; - bitset& operator=(const bitset&) = delete; - bitset& operator=(bitset&&) = default; - - /** - * @brief Create a device-usable view of the bitset. - * - * @return bitset_view - */ - inline auto view() -> raft::core::bitset_view - { - return bitset_view(to_mdspan(), bitset_len_); - } - [[nodiscard]] inline auto view() const -> raft::core::bitset_view - { - return bitset_view(to_mdspan(), bitset_len_); - } - - /** - * @brief Get the device pointer to the bitset. - */ - inline auto data() -> bitset_t* { return bitset_.data(); } - inline auto data() const -> const bitset_t* { return bitset_.data(); } - /** - * @brief Get the number of bits of the bitset representation. - */ - inline auto size() const -> index_t { return bitset_len_; } - - /** - * @brief Get the number of elements used by the bitset representation. - */ - inline auto n_elements() const -> index_t - { - return raft::ceildiv(bitset_len_, bitset_element_size); - } - - /** @brief Get an mdspan view of the current bitset */ - inline auto to_mdspan() -> raft::device_vector_view - { - return raft::make_device_vector_view(bitset_.data(), n_elements()); - } - [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view - { - return raft::make_device_vector_view(bitset_.data(), n_elements()); - } - - /** @brief Resize the bitset. If the requested size is larger, new memory is allocated and set to - * the default value. - * @param res RAFT resources - * @param new_bitset_len new size of the bitset - * @param default_value default value to initialize the new bits to - */ - void resize(const raft::resources& res, index_t new_bitset_len, bool default_value = true) - { - auto old_size = raft::ceildiv(bitset_len_, bitset_element_size); - auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size); - bitset_.resize(new_size); - bitset_len_ = new_bitset_len; - if (old_size < new_size) { - // If the new size is larger, set the new bits to the default value - - thrust::fill_n(resource::get_thrust_policy(res), - bitset_.data() + old_size, - new_size - old_size, - default_value ? ~bitset_t{0} : bitset_t{0}); - } - } - - /** - * @brief Test a list of indices in a bitset. - * - * @tparam output_t Output type of the test. Default is bool. - * @param res RAFT resources - * @param queries List of indices to test - * @param output List of outputs - */ - template - void test(const raft::resources& res, - raft::device_vector_view queries, - raft::device_vector_view output) const - { - RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size"); - auto bitset_view = view(); - raft::linalg::map( - res, - output, - [bitset_view] __device__(index_t query) { return output_t(bitset_view.test(query)); }, - queries); - } - /** - * @brief Set a list of indices in a bitset to set_value. 
- * - * @param res RAFT resources - * @param mask_index indices to remove from the bitset - * @param set_value Value to set the bits to (true or false) - */ - void set(const raft::resources& res, - raft::device_vector_view mask_index, - bool set_value = false) - { - auto this_bitset_view = view(); - thrust::for_each_n(resource::get_thrust_policy(res), - mask_index.data_handle(), - mask_index.extent(0), - [this_bitset_view, set_value] __device__(const index_t sample_index) { - this_bitset_view.set(sample_index, set_value); - }); - } - /** - * @brief Flip all the bits in a bitset. - * @param res RAFT resources - */ - void flip(const raft::resources& res) - { - auto bitset_span = this->to_mdspan(); - raft::linalg::map( - res, - bitset_span, - [] __device__(bitset_t element) { return bitset_t(~element); }, - raft::make_const_mdspan(bitset_span)); - } - /** - * @brief Reset the bits in a bitset. - * - * @param res RAFT resources - * @param default_value Value to set the bits to (true or false) - */ - void reset(const raft::resources& res, bool default_value = true) - { - thrust::fill_n(resource::get_thrust_policy(res), - bitset_.data(), - n_elements(), +template +_RAFT_HOST_DEVICE inline bool bitset_view::test(const index_t sample_index) const +{ + const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size]; + const index_t bit_index = sample_index % bitset_element_size; + const bool is_bit_set = (bit_element & (bitset_t{1} << bit_index)) != 0; + return is_bit_set; +} + +template +_RAFT_HOST_DEVICE bool bitset_view::operator[](const index_t sample_index) const +{ + return test(sample_index); +} + +template +_RAFT_HOST_DEVICE void bitset_view::set(const index_t sample_index, + bool set_value) const +{ + const index_t bit_element = sample_index / bitset_element_size; + const index_t bit_index = sample_index % bitset_element_size; + const bitset_t bitmask = bitset_t{1} << bit_index; + if (set_value) { + atomicOr(bitset_ptr_ + bit_element, bitmask); + } else { + const bitset_t bitmask2 = ~bitmask; + atomicAnd(bitset_ptr_ + bit_element, bitmask2); + } +} + +template +_RAFT_HOST_DEVICE inline index_t bitset_view::n_elements() const +{ + return raft::ceildiv(bitset_len_, bitset_element_size); +} + +template +bitset::bitset(const raft::resources& res, + raft::device_vector_view mask_index, + index_t bitset_len, + bool default_value) + : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), + raft::resource::get_cuda_stream(res)}, + bitset_len_{bitset_len} +{ + reset(res, default_value); + set(res, mask_index, !default_value); +} + +template +bitset::bitset(const raft::resources& res, + index_t bitset_len, + bool default_value) + : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)), + raft::resource::get_cuda_stream(res)}, + bitset_len_{bitset_len} +{ + reset(res, default_value); +} + +template +index_t bitset::n_elements() const +{ + return raft::ceildiv(bitset_len_, bitset_element_size); +} + +template +void bitset::resize(const raft::resources& res, + index_t new_bitset_len, + bool default_value) +{ + auto old_size = raft::ceildiv(bitset_len_, bitset_element_size); + auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size); + bitset_.resize(new_size); + bitset_len_ = new_bitset_len; + if (old_size < new_size) { + // If the new size is larger, set the new bits to the default value + thrust::fill_n(raft::resource::get_thrust_policy(res), + bitset_.data() + old_size, + new_size - old_size, default_value ? 
~bitset_t{0} : bitset_t{0}); } - /** - * @brief Returns the number of bits set to true in count_gpu_scalar. - * - * @param[in] res RAFT resources - * @param[out] count_gpu_scalar Device scalar to store the count - */ - void count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) - { - auto n_elements_ = n_elements(); - auto count_gpu = - raft::make_device_vector_view(count_gpu_scalar.data_handle(), 1); - auto bitset_matrix_view = raft::make_device_matrix_view( - bitset_.data(), n_elements_, 1); - - bitset_t n_last_element = (bitset_len_ % bitset_element_size); - bitset_t last_element_mask = - n_last_element ? (bitset_t)((bitset_t{1} << n_last_element) - bitset_t{1}) : ~bitset_t{0}; - raft::linalg::coalesced_reduction( - res, - bitset_matrix_view, - count_gpu, - index_t{0}, - false, - [last_element_mask, n_elements_] __device__(bitset_t element, index_t index) { - index_t result = 0; - if constexpr (bitset_element_size == 64) { - if (index == n_elements_ - 1) - result = index_t(raft::detail::popc(element & last_element_mask)); - else - result = index_t(raft::detail::popc(element)); - } else { // Needed because popc is not overloaded for 16 and 8 bit elements - if (index == n_elements_ - 1) - result = index_t(raft::detail::popc(uint32_t{element} & last_element_mask)); - else - result = index_t(raft::detail::popc(uint32_t{element})); - } - - return result; - }); - } - /** - * @brief Returns the number of bits set to true. - * - * @param res RAFT resources - * @return index_t Number of bits set to true - */ - auto count(const raft::resources& res) -> index_t - { - auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); - count(res, count_gpu_scalar.view()); - index_t count_cpu = 0; - raft::update_host( - &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); - resource::sync_stream(res); - return count_cpu; - } - /** - * @brief Checks if any of the bits are set to true in the bitset. - * @param res RAFT resources - */ - bool any(const raft::resources& res) { return count(res) > 0; } - /** - * @brief Checks if all of the bits are set to true in the bitset. - * @param res RAFT resources - */ - bool all(const raft::resources& res) { return count(res) == bitset_len_; } - /** - * @brief Checks if none of the bits are set to true in the bitset. 
- * @param res RAFT resources - */ - bool none(const raft::resources& res) { return count(res) == 0; } - - private: - raft::device_uvector bitset_; - index_t bitset_len_; -}; +} + +template +template +void bitset::test(const raft::resources& res, + raft::device_vector_view queries, + raft::device_vector_view output) const +{ + RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size"); + auto bitset_view = view(); + raft::linalg::map( + res, + output, + [bitset_view] __device__(index_t query) { return bitset_view.test(query); }, + queries); +} + +template +void bitset::set(const raft::resources& res, + raft::device_vector_view mask_index, + bool set_value) +{ + auto this_bitset_view = view(); + thrust::for_each_n(raft::resource::get_thrust_policy(res), + mask_index.data_handle(), + mask_index.extent(0), + [this_bitset_view, set_value] __device__(const index_t sample_index) { + this_bitset_view.set(sample_index, set_value); + }); +} + +template +void bitset::flip(const raft::resources& res) +{ + auto bitset_span = this->to_mdspan(); + raft::linalg::map( + res, + bitset_span, + [] __device__(bitset_t element) { return bitset_t(~element); }, + raft::make_const_mdspan(bitset_span)); +} + +template +void bitset::reset(const raft::resources& res, bool default_value) +{ + thrust::fill_n(raft::resource::get_thrust_policy(res), + bitset_.data(), + n_elements(), + default_value ? ~bitset_t{0} : bitset_t{0}); +} + +template +void bitset::count(const raft::resources& res, + raft::device_scalar_view count_gpu_scalar) +{ + auto values = + raft::make_device_vector_view(bitset_.data(), n_elements()); + raft::detail::popc(res, values, bitset_len_, count_gpu_scalar); +} -/** @} */ } // end namespace raft::core diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp new file mode 100644 index 0000000000..0df12f25e6 --- /dev/null +++ b/cpp/include/raft/core/bitset.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::core { +/** + * @defgroup bitset Bitset + * @{ + */ +/** + * @brief View of a RAFT Bitset. + * + * This lightweight structure stores a pointer to a bitset in device memory with it's length. + * It provides a test() device function to check if a given index is set in the bitset. + * + * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t. + * @tparam index_t Indexing type used. Default is uint32_t. + */ +template +struct bitset_view { + static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; + + _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len) + : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len} + { + } + /** + * @brief Create a bitset view from a device vector view of the bitset. 
+ * + * @param bitset_span Device vector view of the bitset + * @param bitset_len Number of bits in the bitset + */ + _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view bitset_span, + index_t bitset_len) + : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len} + { + } + /** + * @brief Device function to test if a given index is set in the bitset. + * + * @param sample_index Single index to test + * @return bool True if index has not been unset in the bitset + */ + inline _RAFT_HOST_DEVICE auto test(const index_t sample_index) const -> bool; + /** + * @brief Device function to test if a given index is set in the bitset. + * + * @param sample_index Single index to test + * @return bool True if index has not been unset in the bitset + */ + inline _RAFT_HOST_DEVICE auto operator[](const index_t sample_index) const -> bool; + /** + * @brief Device function to set a given index to set_value in the bitset. + * + * @param sample_index index to set + * @param set_value Value to set the bit to (true or false) + */ + inline _RAFT_HOST_DEVICE void set(const index_t sample_index, bool set_value) const; + + /** + * @brief Get the device pointer to the bitset. + */ + inline _RAFT_HOST_DEVICE auto data() -> bitset_t* { return bitset_ptr_; } + inline _RAFT_HOST_DEVICE auto data() const -> const bitset_t* { return bitset_ptr_; } + /** + * @brief Get the number of bits of the bitset representation. + */ + inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; } + + /** + * @brief Get the number of elements used by the bitset representation. + */ + inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t; + + inline auto to_mdspan() -> raft::device_vector_view + { + return raft::make_device_vector_view(bitset_ptr_, n_elements()); + } + inline auto to_mdspan() const -> raft::device_vector_view + { + return raft::make_device_vector_view(bitset_ptr_, n_elements()); + } + + private: + bitset_t* bitset_ptr_; + index_t bitset_len_; +}; + +/** + * @brief RAFT Bitset. + * + * This structure encapsulates a bitset in device memory. It provides a view() method to get a + * device-usable lightweight view of the bitset. + * Each index is represented by a single bit in the bitset. The total number of bytes used is + * ceil(bitset_len / 8). + * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t. + * @tparam index_t Indexing type used. Default is uint32_t. + */ +template +struct bitset { + static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8; + + /** + * @brief Construct a new bitset object with a list of indices to unset. + * + * @param res RAFT resources + * @param mask_index List of indices to unset in the bitset + * @param bitset_len Length of the bitset + * @param default_value Default value to set the bits to. Default is true. + */ + bitset(const raft::resources& res, + raft::device_vector_view mask_index, + index_t bitset_len, + bool default_value = true); + + /** + * @brief Construct a new bitset object + * + * @param res RAFT resources + * @param bitset_len Length of the bitset + * @param default_value Default value to set the bits to. Default is true. + */ + bitset(const raft::resources& res, index_t bitset_len, bool default_value = true); + // Disable copy constructor + bitset(const bitset&) = delete; + bitset(bitset&&) = default; + bitset& operator=(const bitset&) = delete; + bitset& operator=(bitset&&) = default; + + /** + * @brief Create a device-usable view of the bitset. 
+ * + * @return bitset_view + */ + inline auto view() -> raft::core::bitset_view + { + return bitset_view(to_mdspan(), bitset_len_); + } + [[nodiscard]] inline auto view() const -> raft::core::bitset_view + { + return bitset_view(to_mdspan(), bitset_len_); + } + + /** + * @brief Get the device pointer to the bitset. + */ + inline auto data() -> bitset_t* { return bitset_.data(); } + inline auto data() const -> const bitset_t* { return bitset_.data(); } + /** + * @brief Get the number of bits of the bitset representation. + */ + inline auto size() const -> index_t { return bitset_len_; } + + /** + * @brief Get the number of elements used by the bitset representation. + */ + inline auto n_elements() const -> index_t; + + /** @brief Get an mdspan view of the current bitset */ + inline auto to_mdspan() -> raft::device_vector_view + { + return raft::make_device_vector_view(bitset_.data(), n_elements()); + } + [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view + { + return raft::make_device_vector_view(bitset_.data(), n_elements()); + } + + /** @brief Resize the bitset. If the requested size is larger, new memory is allocated and set to + * the default value. + * @param res RAFT resources + * @param new_bitset_len new size of the bitset + * @param default_value default value to initialize the new bits to + */ + void resize(const raft::resources& res, index_t new_bitset_len, bool default_value = true); + + /** + * @brief Test a list of indices in a bitset. + * + * @tparam output_t Output type of the test. Default is bool. + * @param res RAFT resources + * @param queries List of indices to test + * @param output List of outputs + */ + template + void test(const raft::resources& res, + raft::device_vector_view queries, + raft::device_vector_view output) const; + /** + * @brief Set a list of indices in a bitset to set_value. + * + * @param res RAFT resources + * @param mask_index indices to remove from the bitset + * @param set_value Value to set the bits to (true or false) + */ + void set(const raft::resources& res, + raft::device_vector_view mask_index, + bool set_value = false); + /** + * @brief Flip all the bits in a bitset. + * @param res RAFT resources + */ + void flip(const raft::resources& res); + /** + * @brief Reset the bits in a bitset. + * + * @param res RAFT resources + * @param default_value Value to set the bits to (true or false) + */ + void reset(const raft::resources& res, bool default_value = true); + /** + * @brief Returns the number of bits set to true in count_gpu_scalar. + * + * @param[in] res RAFT resources + * @param[out] count_gpu_scalar Device scalar to store the count + */ + void count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar); + /** + * @brief Returns the number of bits set to true. + * + * @param res RAFT resources + * @return index_t Number of bits set to true + */ + auto count(const raft::resources& res) -> index_t + { + auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); + count(res, count_gpu_scalar.view()); + index_t count_cpu = 0; + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); + resource::sync_stream(res); + return count_cpu; + } + /** + * @brief Checks if any of the bits are set to true in the bitset. + * @param res RAFT resources + */ + bool any(const raft::resources& res) { return count(res) > 0; } + /** + * @brief Checks if all of the bits are set to true in the bitset. 
+ * @param res RAFT resources + */ + bool all(const raft::resources& res) { return count(res) == bitset_len_; } + /** + * @brief Checks if none of the bits are set to true in the bitset. + * @param res RAFT resources + */ + bool none(const raft::resources& res) { return count(res) == 0; } + + private: + raft::device_uvector bitset_; + index_t bitset_len_; +}; + +/** @} */ +} // end namespace raft::core diff --git a/cpp/include/raft/core/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp index 532aee4d90..f3f52b46ae 100644 --- a/cpp/include/raft/core/detail/logger.hpp +++ b/cpp/include/raft/core/detail/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in future releases." \ " Please use the version instead.") +#endif #include diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp index 82db75de84..253d8e5b93 100644 --- a/cpp/include/raft/core/detail/nvtx.hpp +++ b/cpp/include/raft/core/detail/nvtx.hpp @@ -24,23 +24,19 @@ #include #include -#include +#include #include #include -#include #include namespace raft::common::nvtx::detail { /** - * @brief An internal struct to store associated state with the color - * generator + * @brief An internal struct to to initialize the color generator */ -struct color_gen_state { - /** collection of all tagged colors generated so far */ - static inline std::unordered_map all_colors_; - /** mutex for accessing the above map */ - static inline std::mutex map_mutex_; +struct color_gen { + /** This determines how many bits of the hash to use for the generator */ + using hash_type = uint16_t; /** saturation */ static inline constexpr float kS = 0.9f; /** value */ @@ -109,32 +105,22 @@ inline auto hsv2rgb(float h, float s, float v) -> uint32_t /** * @brief Helper method to generate 'visually distinct' colors. * Inspired from https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/ - * However, if an associated tag is passed, it will look up in its history for - * any generated color against this tag and if found, just returns it, else - * generates a new color, assigns a tag to it and stores it for future usage. + * It calculates a hash of the passed string and uses the result to generate + * distinct yet deterministic colors. * Such a thing is very useful for nvtx markers where the ranges associated * with a specific tag should ideally get the same color for the purpose of * visualizing it on nsight-systems timeline. - * @param tag look for any previously generated colors with this tag or - * associate the currently generated color with it + * @param tag a string used as an input to generate a distinct color. 
* @return returns 32b RGB integer with alpha channel set of 0xff */ inline auto generate_next_color(const std::string& tag) -> uint32_t { - // std::unordered_map color_gen_state::all_colors_; - // std::mutex color_gen_state::map_mutex_; - - std::lock_guard guard(color_gen_state::map_mutex_); - if (!tag.empty()) { - auto itr = color_gen_state::all_colors_.find(tag); - if (itr != color_gen_state::all_colors_.end()) { return itr->second; } - } - auto h = static_cast(rand()) / static_cast(RAND_MAX); - h += color_gen_state::kInvPhi; + auto x = static_cast(std::hash{}(tag)); + auto u = std::numeric_limits::max(); + auto h = static_cast(x) / static_cast(u); + h += color_gen::kInvPhi; if (h >= 1.f) h -= 1.f; - auto rgb = hsv2rgb(h, color_gen_state::kS, color_gen_state::kV); - if (!tag.empty()) { color_gen_state::all_colors_[tag] = rgb; } - return rgb; + return hsv2rgb(h, color_gen::kS, color_gen::kV); } template diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/core/detail/popc.cuh new file mode 100644 index 0000000000..d74b68b715 --- /dev/null +++ b/cpp/include/raft/core/detail/popc.cuh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace raft::detail { + +/** + * @brief Count the number of bits that are set to 1 in a vector. + * + * @tparam value_t the value type of the vector. + * @tparam index_t the index type of vector and scalar. + * + * @param[in] res raft handle for managing expensive resources + * @param[in] values Device vector of bit fields whose set bits are counted. + * @param[in] max_len Maximum number of bits to count. + * @param[out] counter Number of bits that are set to 1. + */ +template +void popc(const raft::resources& res, + device_vector_view values, + index_t max_len, + raft::device_scalar_view counter) +{ + auto values_size = values.size(); + auto values_matrix = raft::make_device_matrix_view( + values.data_handle(), values_size, 1); + auto counter_vector = raft::make_device_vector_view(counter.data_handle(), 1); + + static constexpr index_t len_per_item = sizeof(value_t) * 8; + + value_t tail_len = (max_len % len_per_item); + value_t tail_mask = tail_len ?
(value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0}; + raft::linalg::coalesced_reduction( + res, + values_matrix, + counter_vector, + index_t{0}, + false, + [tail_mask, values_size] __device__(value_t value, index_t index) { + index_t result = 0; + if constexpr (len_per_item == 64) { + if (index == values_size - 1) + result = index_t(raft::detail::popc(value & tail_mask)); + else + result = index_t(raft::detail::popc(value)); + } else { // Needed because popc is not overloaded for 16 and 8 bit elements + if (index == values_size - 1) + result = index_t(raft::detail::popc(uint32_t{value} & tail_mask)); + else + result = index_t(raft::detail::popc(uint32_t{value})); + } + + return result; + }); +} + +} // end namespace raft::detail \ No newline at end of file diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index 8c6eff582b..18d8b77364 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -31,7 +31,8 @@ #include #include -#include +#include +#include #include @@ -117,7 +118,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) : data_{size, stream, mr} { } @@ -164,19 +165,11 @@ class device_uvector_policy { public: auto create(raft::resources const& res, size_t n) -> container_type { - if (mr_ == nullptr) { - // NB: not using the workspace resource by default! - // The workspace resource is for short-lived temporary allocations. - return container_type(n, resource::get_cuda_stream(res)); - } else { - return container_type(n, resource::get_cuda_stream(res), mr_); - } + return container_type(n, resource::get_cuda_stream(res), mr_); } constexpr device_uvector_policy() = default; - constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr) - { - } + explicit device_uvector_policy(rmm::device_async_resource_ref mr) noexcept : mr_(mr) {} [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { @@ -192,7 +185,7 @@ class device_uvector_policy { [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } private: - rmm::mr::device_memory_resource* mr_{nullptr}; + rmm::device_async_resource_ref mr_{rmm::mr::get_current_device_resource()}; }; } // namespace raft diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 855642cd76..a34f6e2e02 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -21,6 +21,8 @@ #include #include +#include + #include namespace raft { @@ -107,7 +109,7 @@ template auto make_device_mdarray(raft::resources const& handle, - rmm::mr::device_memory_resource* mr, + rmm::device_async_resource_ref mr, extents exts) { using mdarray_t = device_mdarray; diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp index 366e387fdd..856ecc96d7 100644 --- a/cpp/include/raft/core/device_resources.hpp +++ b/cpp/include/raft/core/device_resources.hpp @@ -37,6 +37,7 @@ #include #include +#include #include @@ -120,7 +121,7 @@ class device_resources : public resources { cusparseHandle_t get_cusparse_handle() const { return resource::get_cusparse_handle(*this); } - rmm::exec_policy& get_thrust_policy() const { return resource::get_thrust_policy(*this); } + rmm::exec_policy_nosync& 
get_thrust_policy() const { return resource::get_thrust_policy(*this); } /** * @brief synchronize a stream on the current container diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp index 9aa9e4fb85..b785010a0a 100644 --- a/cpp/include/raft/core/resource/device_memory_resource.hpp +++ b/cpp/include/raft/core/resource/device_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,16 @@ namespace raft::resource { * @{ */ +class device_memory_resource : public resource { + public: + explicit device_memory_resource(std::shared_ptr mr) : mr_(mr) {} + ~device_memory_resource() override = default; + auto get_resource() -> void* override { return mr_.get(); } + + private: + std::shared_ptr mr_; +}; + class limiting_memory_resource : public resource { public: limiting_memory_resource(std::shared_ptr mr, @@ -66,6 +76,29 @@ class limiting_memory_resource : public resource { } }; +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +class large_workspace_resource_factory : public resource_factory { + public: + explicit large_workspace_resource_factory( + std::shared_ptr mr = {nullptr}) + : mr_{mr ? mr + : std::shared_ptr{ + rmm::mr::get_current_device_resource(), void_op{}}} + { + } + auto get_resource_type() -> resource_type override + { + return resource_type::LARGE_WORKSPACE_RESOURCE; + } + auto make_resource() -> resource* override { return new device_memory_resource(mr_); } + + private: + std::shared_ptr mr_; +}; + /** * Factory that knows how to construct a specific raft::resource to populate * the resources instance. @@ -144,7 +177,7 @@ class workspace_resource_factory : public resource_factory { // Note, the workspace does not claim all this memory from the start, so it's still usable by // the main resource as well. // This limit is merely an order for algorithm internals to plan the batching accordingly. 
- return total_size / 2; + return total_size / 4; } }; @@ -241,6 +274,21 @@ inline void set_workspace_to_global_resource( workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt)); }; +inline auto get_large_workspace_resource(resources const& res) -> rmm::mr::device_memory_resource* +{ + if (!res.has_resource_factory(resource_type::LARGE_WORKSPACE_RESOURCE)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::LARGE_WORKSPACE_RESOURCE); +}; + +inline void set_large_workspace_resource(resources const& res, + std::shared_ptr mr = { + nullptr}) +{ + res.add_resource_factory(std::make_shared(mr)); +}; + /** @} */ } // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index d2021728c4..d9126251c9 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -28,23 +28,24 @@ namespace raft::resource { */ enum resource_type { // device-specific resource types - CUBLAS_HANDLE = 0, // cublas handle - CUSOLVER_DN_HANDLE, // cusolver dn handle - CUSOLVER_SP_HANDLE, // cusolver sp handle - CUSPARSE_HANDLE, // cusparse handle - CUDA_STREAM_VIEW, // view of a cuda stream - CUDA_STREAM_POOL, // cuda stream pool - CUDA_STREAM_SYNC_EVENT, // cuda event for syncing streams - COMMUNICATOR, // raft communicator - SUB_COMMUNICATOR, // raft sub communicator - DEVICE_PROPERTIES, // cuda device properties - DEVICE_ID, // cuda device id - STREAM_VIEW, // view of a cuda stream or a placeholder in - // CUDA-free builds - THRUST_POLICY, // thrust execution policy - WORKSPACE_RESOURCE, // rmm device memory resource - CUBLASLT_HANDLE, // cublasLt handle - CUSTOM, // runtime-shared default-constructible resource + CUBLAS_HANDLE = 0, // cublas handle + CUSOLVER_DN_HANDLE, // cusolver dn handle + CUSOLVER_SP_HANDLE, // cusolver sp handle + CUSPARSE_HANDLE, // cusparse handle + CUDA_STREAM_VIEW, // view of a cuda stream + CUDA_STREAM_POOL, // cuda stream pool + CUDA_STREAM_SYNC_EVENT, // cuda event for syncing streams + COMMUNICATOR, // raft communicator + SUB_COMMUNICATOR, // raft sub communicator + DEVICE_PROPERTIES, // cuda device properties + DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds + THRUST_POLICY, // thrust execution policy + WORKSPACE_RESOURCE, // rmm device memory resource for small temporary allocations + CUBLASLT_HANDLE, // cublasLt handle + CUSTOM, // runtime-shared default-constructible resource + LARGE_WORKSPACE_RESOURCE, // rmm device memory resource for somewhat large temporary allocations LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/core/resource/thrust_policy.hpp b/cpp/include/raft/core/resource/thrust_policy.hpp index f81898be8a..c728f0a00e 100644 --- a/cpp/include/raft/core/resource/thrust_policy.hpp +++ b/cpp/include/raft/core/resource/thrust_policy.hpp @@ -24,7 +24,7 @@ namespace raft::resource { class thrust_policy_resource : public resource { public: thrust_policy_resource(rmm::cuda_stream_view stream_view) - : thrust_policy_(std::make_unique(stream_view)) + : thrust_policy_(std::make_unique(stream_view)) { } void* get_resource() override { return thrust_policy_.get(); } @@ -32,7 +32,7 @@ class thrust_policy_resource : public resource { ~thrust_policy_resource() override {} private: - std::unique_ptr thrust_policy_; + std::unique_ptr thrust_policy_; }; /** @@ -60,13 +60,13 @@ class 
thrust_policy_resource_factory : public resource_factory { * @param res raft res object for managing resources * @return thrust execution policy */ -inline rmm::exec_policy& get_thrust_policy(resources const& res) +inline rmm::exec_policy_nosync& get_thrust_policy(resources const& res) { if (!res.has_resource_factory(resource_type::THRUST_POLICY)) { rmm::cuda_stream_view stream = get_cuda_stream(res); res.add_resource_factory(std::make_shared(stream)); } - return *res.get_resource(resource_type::THRUST_POLICY); + return *res.get_resource(resource_type::THRUST_POLICY); }; /** diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh index 3e3699766f..951e030cbd 100644 --- a/cpp/include/raft/distance/detail/masked_nn.cuh +++ b/cpp/include/raft/distance/detail/masked_nn.cuh @@ -256,9 +256,8 @@ void masked_l2_nn_impl(raft::resources const& handle, static_assert(P::Mblk == 64, "masked_l2_nn_impl only supports a policy with 64 rows per block."); // Get stream and workspace memory resource - rmm::mr::device_memory_resource* ws_mr = - dynamic_cast(resource::get_workspace_resource(handle)); auto stream = resource::get_cuda_stream(handle); + auto ws_mr = resource::get_workspace_resource(handle); // Acquire temporary buffers and initialize to zero: // 1) Adjacency matrix bitfield diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh index ed0b6848ae..cba059154f 100644 --- a/cpp/include/raft/distance/specializations.cuh +++ b/cpp/include/raft/distance/specializations.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message( \ __FILE__ \ " is deprecated and will be removed." \ " Including specializations is not necessary any more." \ " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") +#endif diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index ed0b6848ae..cba059154f 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message( \ __FILE__ \ " is deprecated and will be removed." \ " Including specializations is not necessary any more." \ " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") +#endif diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh index 9588a7f329..e85b05575f 100644 --- a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh +++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
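A note on the deprecation guards added throughout this change: wrapping each deprecation #pragma message in #ifndef RAFT_HIDE_DEPRECATION_WARNINGS / #endif lets downstream builds silence the notices without patching the headers. A minimal consumer-side sketch follows (the header below is just one of the deprecated files touched here; passing -DRAFT_HIDE_DEPRECATION_WARNINGS on the compiler command line has the same effect):

// Sketch only: define the macro before including any deprecated RAFT header
// so the #ifndef RAFT_HIDE_DEPRECATION_WARNINGS guard skips the #pragma message.
#define RAFT_HIDE_DEPRECATION_WARNINGS
#include <raft/distance/specializations/fused_l2_nn_min.cuh>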
@@ -15,8 +15,10 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message( \ __FILE__ \ " is deprecated and will be removed." \ " Including specializations is not necessary any more." \ " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") +#endif diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index f7828294cd..b06cd113c1 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,11 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use the raft/solver version instead.") +#endif #include diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp index 5472422053..0f1ad14ed5 100644 --- a/cpp/include/raft/lap/lap.hpp +++ b/cpp/include/raft/lap/lap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,10 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Please use the cuh version instead.") +#endif #include diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index d580ea72c1..9f3be7ce0e 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -28,11 +28,18 @@ namespace raft { namespace linalg { namespace detail { -template +template struct ReductionThinPolicy { - static constexpr int LogicalWarpSize = warpSize; - static constexpr int RowsPerBlock = rpb; - static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock; + static_assert(tpb % warpSize == 0); + + static constexpr int LogicalWarpSize = warpSize; + static constexpr int ThreadsPerBlock = tpb; + static constexpr int RowsPerLogicalWarp = rpw; + static constexpr int NumLogicalWarps = ThreadsPerBlock / LogicalWarpSize; + static constexpr int RowsPerBlock = NumLogicalWarps * RowsPerLogicalWarp; + + // Whether D (run-time arg) will be smaller than warpSize (compile-time parameter) + static constexpr bool NoSequentialReduce = noLoop; }; template (blockIdx.x)); - if (i >= N) return; + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. + * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). 
+ */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; - OutType acc = init; - for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { - acc = reduce_op(acc, main_op(data[j + (D * i)], j)); + OutType acc[Policy::RowsPerLogicalWarp]; +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; } - acc = raft::logicalWarpReduce(acc, reduce_op); - if (threadIdx.x == 0) { + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). + * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. + */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, reduce_op); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; if (inplace) { - dots[i] = final_op(reduce_op(dots[i], acc)); +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(reduce_op(dots[i], acc[k])); } + } } else { - dots[i] = final_op(acc); +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k]); } + } } } } @@ -89,8 +149,12 @@ void coalescedReductionThin(OutType* dots, FinalLambda final_op = raft::identity_op()) { common::nvtx::range fun_scope( - "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock); - dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1); + "coalescedReductionThin<%d,%d,%d,%d>", + Policy::LogicalWarpSize, + Policy::ThreadsPerBlock, + Policy::RowsPerLogicalWarp, + static_cast(Policy::NoSequentialReduce)); + dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); coalescedReductionThinKernel <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); @@ -115,19 +179,28 @@ void coalescedReductionThinDispatcher(OutType* dots, FinalLambda final_op = raft::identity_op()) { if (D <= IdxType(2)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, 
inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(4)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(8)) { - coalescedReductionThin>( + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (D <= IdxType(16)) { - coalescedReductionThin>( + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D <= IdxType(32)) { + coalescedReductionThin>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + } else if (D < IdxType(128)) { + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReductionThin>( + // For D=128 (included) and above, the 4x-unrolled loading loop is used + // and multiple rows per warp are counter-productive in terms of cache-friendliness + // and register use. + coalescedReductionThin>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } } @@ -319,10 +392,10 @@ void coalescedReductionThickDispatcher(OutType* dots, // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReductionThick, ReductionThinPolicy<32, 4>>( + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp index 245f8eb4b0..236c840040 100644 --- a/cpp/include/raft/linalg/detail/gemm.hpp +++ b/cpp/include/raft/linalg/detail/gemm.hpp @@ -15,9 +15,11 @@ */ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Use cublaslt_wrappers.hpp if you really need this low-level api.") +#endif #include "cublaslt_wrappers.hpp" diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index c9dcbda5cc..7b8d35706b 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -18,9 +18,11 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." \ " Use raft/linalg/gemm.hpp instead.") +#endif #include "detail/gemm.hpp" #include "gemm.hpp" // Part of the API transferred to the non-deprecated file diff --git a/cpp/include/raft/linalg/lanczos.cuh b/cpp/include/raft/linalg/lanczos.cuh index 04e9980583..0117a8e1d4 100644 --- a/cpp/include/raft/linalg/lanczos.cuh +++ b/cpp/include/raft/linalg/lanczos.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,11 @@ #pragma once +#ifndef RAFT_HIDE_DEPRECATION_WARNINGS #pragma message(__FILE__ \ " is deprecated and will be removed in a future release." 
\ " Please use the sparse solvers version instead.") +#endif #include diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh index 1f60860c8c..de5f4e62ce 100644 --- a/cpp/include/raft/linalg/normalize.cuh +++ b/cpp/include/raft/linalg/normalize.cuh @@ -18,9 +18,11 @@ #include "detail/normalize.cuh" +#include #include #include #include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh index 506cbffcb9..6db1a5acac 100644 --- a/cpp/include/raft/matrix/detail/select_k-ext.cuh +++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh @@ -20,9 +20,6 @@ #include #include // RAFT_EXPLICIT -#include // rmm:cuda_stream_view -#include // rmm::mr::device_memory_resource - #include // __half #include // uint32_t diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index 36a346fda3..2207b0216e 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -29,9 +29,9 @@ #include #include +#include #include -#include -#include +#include #include #include @@ -442,14 +442,76 @@ _RAFT_DEVICE void last_filter(const T* in_buf, } } -template +template +_RAFT_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + char* bufs, + IdxT buf_len, + int pass, + const T*& in_buf, + const IdxT*& in_idx_buf, + T*& out_buf, + IdxT*& out_idx_buf) +{ + // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 + if (pass == 0) { + in_buf = in; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + in_idx_buf = in_idx; + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } else if (pass % 2 == 0) { + in_buf = reinterpret_cast(bufs); + in_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + out_buf = const_cast(in_buf + buf_len); + out_idx_buf = const_cast(in_idx_buf + buf_len); + } else { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + in_buf = out_buf + buf_len; + in_idx_buf = out_idx_buf + buf_len; + } +} + +template +_RAFT_DEVICE void set_buf_pointers(const T* in, + const IdxT* in_idx, + char* bufs, + IdxT buf_len, + const int pass, + const T*& out_buf, + const IdxT*& out_idx_buf) +{ + // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 + if (pass == 0) { + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } else if (pass % 2 == 0) { + out_buf = const_cast(reinterpret_cast(bufs) + buf_len); + out_idx_buf = + const_cast(reinterpret_cast(bufs + sizeof(T) * 2 * buf_len) + buf_len); + } else { + out_buf = reinterpret_cast(bufs); + out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); + } +} + +template RAFT_KERNEL last_filter_kernel(const T* in, const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, + char* bufs, + size_t offset, T* out, IdxT* out_idx, const IdxT len, + const IdxT* len_i, const IdxT k, Counter* counters, const bool select_min) @@ -458,22 +520,31 @@ RAFT_KERNEL last_filter_kernel(const T* in, Counter* counter = counters + batch_id; IdxT previous_len = counter->previous_len; + if (previous_len == 0) { return; } + + const IdxT l_len = len_or_indptr ? 
len : (len_i[batch_id + 1] - len_i[batch_id]); + const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id]; + const IdxT buf_len = calc_buf_len(len); - if (previous_len > buf_len || in_buf == in) { - in_buf = in + batch_id * len; - in_idx_buf = in_idx ? (in_idx + batch_id * len) : nullptr; - previous_len = len; - } else { - in_buf += batch_id * buf_len; - in_idx_buf += batch_id * buf_len; - } - out += batch_id * k; - out_idx += batch_id * k; + + const T* in_buf = nullptr; + const IdxT* in_idx_buf = nullptr; + bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); constexpr int pass = calc_num_passes() - 1; constexpr int start_bit = calc_start_bit(pass); + set_buf_pointers(in + l_offset, in_idx + l_offset, bufs, buf_len, pass, in_buf, in_idx_buf); + + if (previous_len > buf_len || in_buf == in + l_offset) { + in_buf = in + l_offset; + in_idx_buf = in_idx ? (in_idx + l_offset) : nullptr; + previous_len = l_len; + } + out += batch_id * k; + out_idx += batch_id * k; + const auto kth_value_bits = counter->kth_value_bits; const IdxT num_of_kth_needed = counter->k; IdxT* p_out_cnt = &counter->out_cnt; @@ -510,6 +581,29 @@ RAFT_KERNEL last_filter_kernel(const T* in, f); } +template +_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_val( + T* dest, const T* src, S len, IdxT k, const bool select_min) +{ + S idx = S(threadIdx.x); + S stride = S(blockDim.x); + const T default_val = select_min ? upper_bound() : lower_bound(); + for (S i = idx; i < k; i += stride) { + dest[i] = i < len ? src[i] : default_val; + } +} + +template +_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_idx(T* dest, const T* src, S len) +{ + S idx = S(threadIdx.x); + S stride = S(blockDim.x); + + for (S i = idx; i < len; i += stride) { + dest[i] = src ? src[i] : i; + } +} + /** * * It is expected to call this kernel multiple times (passes), in each pass we process a radix, @@ -545,13 +639,16 @@ RAFT_KERNEL last_filter_kernel(const T* in, * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and * their indices. */ -template +template RAFT_KERNEL radix_kernel(const T* in, const IdxT* in_idx, - const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, + char* bufs, + size_t offset, T* out, IdxT* out_idx, Counter* counters, @@ -567,21 +664,38 @@ RAFT_KERNEL radix_kernel(const T* in, IdxT current_k; IdxT previous_len; IdxT current_len; + + const IdxT l_len = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]); + const IdxT l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id]; + if (pass == 0) { current_k = k; - previous_len = len; + previous_len = l_len; // Need to do this so setting counter->previous_len for the next pass is correct. // This value is meaningless for pass 0, but it's fine because pass 0 won't be the // last pass in this implementation so pass 0 won't hit the "if (pass == // num_passes - 1)" branch. // Maybe it's better to reload counter->previous_len and use it rather than // current_len in last_filter() - current_len = len; + current_len = l_len; } else { current_k = counter->k; current_len = counter->len; previous_len = counter->previous_len; } + if constexpr (!len_or_indptr) { + if (pass == 0 && l_len <= k) { + copy_in_val(out + batch_id * k, in + l_offset, l_len, k, select_min); + copy_in_idx(out_idx + batch_id * k, (in_idx ? 
(in_idx + l_offset) : nullptr), l_len); + if (threadIdx.x == 0) { + counter->previous_len = 0; + counter->len = 0; + } + __syncthreads(); + return; + } + } + if (current_len == 0) { return; } // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle @@ -590,20 +704,33 @@ RAFT_KERNEL radix_kernel(const T* in, const bool early_stop = (current_len == current_k); const IdxT buf_len = calc_buf_len(len); + const T* in_buf; + const IdxT* in_idx_buf; + T* out_buf; + IdxT* out_idx_buf; + bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); + + set_buf_pointers(in + l_offset, + (in_idx ? (in_idx + l_offset) : nullptr), + bufs, + buf_len, + pass, + in_buf, + in_idx_buf, + out_buf, + out_idx_buf); + // "previous_len > buf_len" means previous pass skips writing buffer if (pass == 0 || pass == 1 || previous_len > buf_len) { - in_buf = in + batch_id * len; - in_idx_buf = in_idx ? (in_idx + batch_id * len) : nullptr; - previous_len = len; - } else { - in_buf += batch_id * buf_len; - in_idx_buf += batch_id * buf_len; + in_buf = in + l_offset; + in_idx_buf = in_idx ? (in_idx + l_offset) : nullptr; + previous_len = l_len; } // in case we have individual len for each query defined we want to make sure // that we only iterate valid elements. if (len_i != nullptr) { - const IdxT max_len = max(len_i[batch_id], k); + const IdxT max_len = max(l_len, k); if (max_len < previous_len) previous_len = max_len; } @@ -611,9 +738,6 @@ RAFT_KERNEL radix_kernel(const T* in, if (pass == 0 || current_len > buf_len) { out_buf = nullptr; out_idx_buf = nullptr; - } else { - out_buf += batch_id * buf_len; - out_idx_buf += batch_id * buf_len; } out += batch_id * k; out_idx += batch_id * k; @@ -640,7 +764,6 @@ RAFT_KERNEL radix_kernel(const T* in, unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1); isLastBlock = (finished == (gridDim.x - 1)); } - if (__syncthreads_or(isLastBlock)) { if (early_stop) { if (threadIdx.x == 0) { @@ -676,7 +799,7 @@ RAFT_KERNEL radix_kernel(const T* in, out_idx_buf ? out_idx_buf : in_idx_buf, out, out_idx, - out_buf ? current_len : len, + out_buf ? 
current_len : l_len, k, counter, select_min, @@ -726,7 +849,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) int active_blocks; RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &active_blocks, radix_kernel, BlockSize, 0)); + &active_blocks, radix_kernel, BlockSize, 0)); active_blocks *= sm_cnt; IdxT best_num_blocks = 0; @@ -757,78 +880,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) return best_num_blocks; } -template -_RAFT_HOST void set_buf_pointers(const T* in, - const IdxT* in_idx, - T* buf1, - IdxT* idx_buf1, - T* buf2, - IdxT* idx_buf2, - int pass, - const T*& in_buf, - const IdxT*& in_idx_buf, - T*& out_buf, - IdxT*& out_idx_buf) -{ - if (pass == 0) { - in_buf = in; - in_idx_buf = nullptr; - out_buf = nullptr; - out_idx_buf = nullptr; - } else if (pass == 1) { - in_buf = in; - in_idx_buf = in_idx; - out_buf = buf1; - out_idx_buf = idx_buf1; - } else if (pass % 2 == 0) { - in_buf = buf1; - in_idx_buf = idx_buf1; - out_buf = buf2; - out_idx_buf = idx_buf2; - } else { - in_buf = buf2; - in_idx_buf = idx_buf2; - out_buf = buf1; - out_idx_buf = idx_buf1; - } -} - -template -_RAFT_DEVICE void set_buf_pointers(const T* in, - const IdxT* in_idx, - char* bufs, - IdxT buf_len, - int pass, - const T*& in_buf, - const IdxT*& in_idx_buf, - T*& out_buf, - IdxT*& out_idx_buf) -{ - // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2 - if (pass == 0) { - in_buf = in; - in_idx_buf = nullptr; - out_buf = nullptr; - out_idx_buf = nullptr; - } else if (pass == 1) { - in_buf = in; - in_idx_buf = in_idx; - out_buf = reinterpret_cast(bufs); - out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - } else if (pass % 2 == 0) { - in_buf = reinterpret_cast(bufs); - in_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - out_buf = const_cast(in_buf + buf_len); - out_idx_buf = const_cast(in_idx_buf + buf_len); - } else { - out_buf = reinterpret_cast(bufs); - out_idx_buf = reinterpret_cast(bufs + sizeof(T) * 2 * buf_len); - in_buf = out_buf + buf_len; - in_idx_buf = out_idx_buf + buf_len; - } -} - -template +template void radix_topk(const T* in, const IdxT* in_idx, int batch_size, @@ -842,15 +894,13 @@ void radix_topk(const T* in, unsigned grid_dim, int sm_cnt, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // TODO: is it possible to relax this restriction? 
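// --- Illustrative aside, not part of this diff -----------------------------
// The set_buf_pointers overloads above slice one contiguous per-row scratch
// allocation into the four ping-pong buffers (buf1, buf2, idx_buf1, idx_buf2),
// which is why the single `bufs` workspace allocated further below holds
// buf_len * 2 * (sizeof(T) + sizeof(IdxT)) bytes per batch row. A minimal
// host-side sketch of that layout arithmetic, using hypothetical helper names:
#include <cstddef>

template <typename T, typename IdxT>
struct row_buf_layout {
  std::size_t val1;   // byte offset of buf1 (first value buffer)
  std::size_t val2;   // byte offset of buf2 (second value buffer)
  std::size_t idx1;   // byte offset of idx_buf1 (first index buffer)
  std::size_t idx2;   // byte offset of idx_buf2 (second index buffer)
  std::size_t bytes;  // total scratch bytes per batch row
};

template <typename T, typename IdxT>
row_buf_layout<T, IdxT> make_row_buf_layout(std::size_t buf_len)
{
  row_buf_layout<T, IdxT> l{};
  l.val1  = 0;
  l.val2  = sizeof(T) * buf_len;              // buf2 follows buf1
  l.idx1  = sizeof(T) * 2 * buf_len;          // index buffers follow both value buffers
  l.idx2  = l.idx1 + sizeof(IdxT) * buf_len;
  l.bytes = buf_len * 2 * (sizeof(T) + sizeof(IdxT));
  return l;
}
// ----------------------------------------------------------------------------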
static_assert(calc_num_passes() > 1); constexpr int num_buckets = calc_num_buckets(); - if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } - - auto kernel = radix_kernel; + auto kernel = radix_kernel; const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, false); if (max_chunk_size != static_cast(batch_size)) { @@ -862,55 +912,33 @@ void radix_topk(const T* in, rmm::device_uvector> counters(max_chunk_size, stream, mr); rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); - rmm::device_uvector buf1(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector idx_buf1(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector buf2(max_chunk_size * buf_len, stream, mr); - rmm::device_uvector idx_buf2(max_chunk_size * buf_len, stream, mr); + + rmm::device_uvector bufs( + max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); RAFT_CUDA_TRY( cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); - auto kernel = radix_kernel; + auto kernel = radix_kernel; - const T* chunk_in = in + offset * len; - const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr; - T* chunk_out = out + offset * k; - IdxT* chunk_out_idx = out_idx + offset * k; - const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; - - const T* in_buf = nullptr; - const IdxT* in_idx_buf = nullptr; - T* out_buf = nullptr; - IdxT* out_idx_buf = nullptr; + T* chunk_out = out + offset * k; + IdxT* chunk_out_idx = out_idx + offset * k; + const IdxT* chunk_len_i = len_i ? 
(len_i + offset) : nullptr; dim3 blocks(grid_dim, chunk_size); constexpr int num_passes = calc_num_passes(); for (int pass = 0; pass < num_passes; ++pass) { - set_buf_pointers(chunk_in, - chunk_in_idx, - buf1.data(), - idx_buf1.data(), - buf2.data(), - idx_buf2.data(), - pass, - in_buf, - in_idx_buf, - out_buf, - out_idx_buf); - if (fused_last_filter && pass == num_passes - 1) { - kernel = radix_kernel; + kernel = radix_kernel; } - kernel<<>>(chunk_in, - chunk_in_idx, - in_buf, - in_idx_buf, - out_buf, - out_idx_buf, + kernel<<>>(in, + in_idx, + bufs.data(), + offset, chunk_out, chunk_out_idx, counters.data(), @@ -924,16 +952,18 @@ void radix_topk(const T* in, } if (!fused_last_filter) { - last_filter_kernel<<>>(chunk_in, - chunk_in_idx, - out_buf, - out_idx_buf, - chunk_out, - chunk_out_idx, - len, - k, - counters.data(), - select_min); + last_filter_kernel + <<>>(in, + in_idx, + bufs.data(), + offset, + chunk_out, + chunk_out_idx, + len, + chunk_len_i, + k, + counters.data(), + select_min); RAFT_CUDA_TRY(cudaPeekAtLastError()); } } @@ -1015,7 +1045,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf, } } -template +template RAFT_KERNEL radix_topk_one_block_kernel(const T* in, const IdxT* in_idx, const IdxT len, @@ -1024,30 +1054,48 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, T* out, IdxT* out_idx, const bool select_min, - char* bufs) + char* bufs, + size_t offset) { constexpr int num_buckets = calc_num_buckets(); __shared__ Counter counter; __shared__ IdxT histogram[num_buckets]; + const size_t batch_id = blockIdx.x; // size_t to avoid multiplication overflow + + IdxT l_len = len; + IdxT l_offset = (offset + batch_id) * len; + if constexpr (!len_or_indptr) { + l_offset = len_i[batch_id]; + l_len = len_i[batch_id + 1] - l_offset; + } + if (threadIdx.x == 0) { counter.k = k; - counter.len = len; - counter.previous_len = len; + counter.len = l_len; + counter.previous_len = l_len; counter.kth_value_bits = 0; counter.out_cnt = 0; counter.out_back_cnt = 0; } __syncthreads(); - const size_t batch_id = blockIdx.x; // size_t to avoid multiplication overflow - in += batch_id * len; - if (in_idx) { in_idx += batch_id * len; } + in += l_offset; + if (in_idx) { in_idx += l_offset; } out += batch_id * k; out_idx += batch_id * k; const IdxT buf_len = calc_buf_len(len); bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT)); + if constexpr (!len_or_indptr) { + if (l_len <= k) { + copy_in_val(out, in, l_len, k, select_min); + copy_in_idx(out_idx, in_idx, l_len); + __syncthreads(); + return; + } + } + constexpr int num_passes = calc_num_passes(); for (int pass = 0; pass < num_passes; ++pass) { const T* in_buf; @@ -1073,7 +1121,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // in case we have individual len for each query defined we want to make sure // that we only iterate valid elements. if (len_i != nullptr) { - const IdxT max_len = max(len_i[batch_id], k); + const IdxT max_len = max(l_len, k); if (max_len < previous_len) previous_len = max_len; } @@ -1102,7 +1150,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, out_buf ? out_idx_buf : in_idx, out, out_idx, - out_buf ? current_len : len, + out_buf ? current_len : l_len, k, &counter, select_min, @@ -1117,7 +1165,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // counters and global histograms, can be kept in shared memory and cheap sync operations can be // used. It's used when len is relatively small or when the number of blocks per row calculated by // `calc_grid_dim()` is 1. 
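// --- Illustrative aside, not part of this diff -----------------------------
// Both the multi-block and the one-block kernels now address each batch row
// through a per-row offset and length: with `len_or_indptr == true` every row
// has `len` contiguous elements, while with `len_or_indptr == false` the rows
// are CSR-style segments described by the `len_i` indptr. A standalone sketch
// of that addressing with hypothetical values (the chunk `offset` is taken as 0):
#include <cstdint>
#include <cstdio>

int main()
{
  using IdxT = std::int64_t;
  const IdxT len      = 8;              // uniform row length (dense mode)
  const IdxT indptr[] = {0, 3, 3, 9};   // CSR indptr: batch_size + 1 = 4 entries

  for (int batch_id = 0; batch_id < 3; ++batch_id) {
    // dense mode: len_or_indptr == true
    const IdxT dense_len    = len;
    const IdxT dense_offset = static_cast<IdxT>(batch_id) * len;
    // CSR mode: len_or_indptr == false
    const IdxT csr_len    = indptr[batch_id + 1] - indptr[batch_id];
    const IdxT csr_offset = indptr[batch_id];
    std::printf("row %d: dense(off=%lld, len=%lld)  csr(off=%lld, len=%lld)\n",
                batch_id,
                static_cast<long long>(dense_offset),
                static_cast<long long>(dense_len),
                static_cast<long long>(csr_offset),
                static_cast<long long>(csr_len));
  }
  return 0;
}
// ----------------------------------------------------------------------------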
-template +template void radix_topk_one_block(const T* in, const IdxT* in_idx, int batch_size, @@ -1129,11 +1177,11 @@ void radix_topk_one_block(const T* in, const IdxT* len_i, int sm_cnt, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { static_assert(calc_num_passes() > 1); - auto kernel = radix_topk_one_block_kernel; + auto kernel = radix_topk_one_block_kernel; const IdxT buf_len = calc_buf_len(len); const size_t max_chunk_size = calc_chunk_size(batch_size, len, sm_cnt, kernel, true); @@ -1144,15 +1192,16 @@ void radix_topk_one_block(const T* in, for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; - kernel<<>>(in + offset * len, - in_idx ? (in_idx + offset * len) : nullptr, + kernel<<>>(in, + in_idx, len, chunk_len_i, k, out + offset * k, out_idx + offset * k, select_min, - bufs.data()); + bufs.data(), + offset); } } @@ -1182,6 +1231,10 @@ void radix_topk_one_block(const T* in, * it affects the number of passes and number of buckets. * @tparam BlockSize * Number of threads in a kernel thread block. + * @tparam len_or_indptr + * Flag to interpret `len_i` as either direct row lengths (true) or CSR format + * index pointers (false). When true, each `len_i` element denotes the length of a row. When + * false, `len_i` represents the index pointers of a CSR matrix and has size `batch_size + 1`. + * * @param[in] res container of reusable resources * @param[in] in @@ -1212,9 +1265,12 @@ void radix_topk_one_block(const T* in, * same. That is, when the value range of input data is narrow. In such case, there could be a * large number of inputs for the last filter, hence using multiple thread blocks is beneficial. * @param len_i - * optional array of size (batch_size) providing lengths for each individual row + * Optional array used differently based on `len_or_indptr`: + * When `len_or_indptr` is true, `len_i` provides the length of each row, and its size is `batch_size`. + * When `len_or_indptr` is false, `len_i` works like an indptr for a CSR matrix. The length of each + * row is (`len_i[row_id + 1] - len_i[row_id]`), and the size of `len_i` is `batch_size + 1`.
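+ *
+ * For illustration only (this example is not part of the original header), a call that
+ * selects the smallest `k` values of each CSR segment might look as follows. The template
+ * arguments are hypothetical, the trailing boolean is assumed to be the `len_or_indptr`
+ * parameter set to `false`, and `in`, `out`, `out_idx`, and `indptr` stand for
+ * pre-allocated device buffers:
+ * @code{.cpp}
+ *   // 3 segments described by a device-side indptr of 4 offsets; k = 4, select_min = true
+ *   select_k<float, int64_t, 11, 512, false>(
+ *     res, in, nullptr, 3, len, 4, out, out_idx, true, true, indptr);
+ * @endcode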
*/ -template +template void select_k(raft::resources const& res, const T* in, const IdxT* in_idx, @@ -1227,9 +1283,12 @@ void select_k(raft::resources const& res, bool fused_last_filter, const IdxT* len_i) { + RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)), + "When `len_or_indptr` is false, `len_i` must not be nullptr!"); + auto stream = resource::get_cuda_stream(res); auto mr = resource::get_workspace_resource(res); - if (k == len) { + if (k == len && len_or_indptr) { RAFT_CUDA_TRY( cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); if (in_idx) { @@ -1248,29 +1307,29 @@ void select_k(raft::resources const& res, constexpr int items_per_thread = 32; if (len <= BlockSize * items_per_thread) { - impl::radix_topk_one_block( + impl::radix_topk_one_block( in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { unsigned grid_dim = impl::calc_grid_dim(batch_size, len, sm_cnt); if (grid_dim == 1) { - impl::radix_topk_one_block( + impl::radix_topk_one_block( in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { - impl::radix_topk(in, - in_idx, - batch_size, - len, - k, - out, - out_idx, - select_min, - fused_last_filter, - len_i, - grid_dim, - sm_cnt, - stream, - mr); + impl::radix_topk(in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + fused_last_filter, + len_i, + grid_dim, + sm_cnt, + stream, + mr); } } } diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 572558153d..7da659291c 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -27,8 +27,9 @@ #include #include +#include #include -#include +#include #include #include @@ -754,22 +755,32 @@ template