
Commit

Merge branch 'branch-25.04' into batchnnd-dedup
cjnolet authored Feb 25, 2025
2 parents 312913c + cb6fe7c commit f286983
Showing 35 changed files with 458 additions and 70 deletions.
15 changes: 8 additions & 7 deletions .github/CODEOWNERS
@@ -5,20 +5,21 @@ cpp/ @rapidsai/raft-cpp-codeowners
python/ @rapidsai/raft-python-codeowners

#cmake code owners
**/CMakeLists.txt @rapidsai/raft-cmake-codeowners
CMakeLists.txt @rapidsai/raft-cmake-codeowners
**/cmake/ @rapidsai/raft-cmake-codeowners
*.cmake @rapidsai/raft-cmake-codeowners
python/setup.py @rapidsai/raft-cmake-codeowners
build.sh @rapidsai/raft-cmake-codeowners
**/build.sh @rapidsai/raft-cmake-codeowners

#CI code owners
/.github/ @rapidsai/ci-codeowners
/ci/ @rapidsai/ci-codeowners
/.pre-commit-config.yaml @rapidsai/ci-codeowners

#packaging code owners
/.devcontainer/ @rapidsai/packaging-codeowners
/conda/ @rapidsai/packaging-codeowners
/dependencies.yaml @rapidsai/packaging-codeowners
/build.sh @rapidsai/packaging-codeowners
pyproject.toml @rapidsai/packaging-codeowners
/.pre-commit-config.yaml @rapidsai/packaging-codeowners
/.devcontainer/ @rapidsai/packaging-codeowners
/conda/ @rapidsai/packaging-codeowners
dependencies.yaml @rapidsai/packaging-codeowners
/build.sh @rapidsai/packaging-codeowners
pyproject.toml @rapidsai/packaging-codeowners
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -107,7 +107,7 @@ repos:
hooks:
- id: check-json
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.4.0
rev: v0.5.0
hooks:
- id: verify-copyright
files: |
@@ -122,6 +122,8 @@ repos:
cpp/include/raft/thirdparty/|
docs/source/sphinxext/github_link[.]py|
- id: verify-alpha-spec
- id: verify-codeowners
args: [--fix, --project-prefix=raft]
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.17.0
hooks:
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -11,7 +11,7 @@ dependencies:
- c-compiler
- clang-tools=16.0.6
- clang==16.0.6
- cmake>=3.26.4,!=3.30.0
- cmake>=3.30.4
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
- cuda-python>=11.8.5,<12.0a0
@@ -55,6 +55,7 @@ dependencies:
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- sysroot_linux-aarch64==2.28
- ucx-py==0.43.*,>=0.0.0a0
name: all_cuda-118_arch-aarch64
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -11,7 +11,7 @@ dependencies:
- c-compiler
- clang-tools=16.0.6
- clang==16.0.6
- cmake>=3.26.4,!=3.30.0
- cmake>=3.30.4
- cuda-nvtx=11.8
- cuda-profiler-api=11.8.86
- cuda-python>=11.8.5,<12.0a0
@@ -55,6 +55,7 @@ dependencies:
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- sysroot_linux-64==2.28
- ucx-py==0.43.*,>=0.0.0a0
name: all_cuda-118_arch-x86_64
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-128_arch-aarch64.yaml
@@ -11,7 +11,7 @@ dependencies:
- c-compiler
- clang-tools=16.0.6
- clang==16.0.6
- cmake>=3.26.4,!=3.30.0
- cmake>=3.30.4
- cuda-cudart-dev
- cuda-nvcc
- cuda-nvtx-dev
@@ -51,6 +51,7 @@ dependencies:
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- sysroot_linux-aarch64==2.28
- ucx-py==0.43.*,>=0.0.0a0
name: all_cuda-128_arch-aarch64
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -11,7 +11,7 @@ dependencies:
- c-compiler
- clang-tools=16.0.6
- clang==16.0.6
- cmake>=3.26.4,!=3.30.0
- cmake>=3.30.4
- cuda-cudart-dev
- cuda-nvcc
- cuda-nvtx-dev
@@ -51,6 +51,7 @@ dependencies:
- scipy
- sphinx-copybutton
- sphinx-markdown-tables
- sphinx<8.2.0
- sysroot_linux-64==2.28
- ucx-py==0.43.*,>=0.0.0a0
name: all_cuda-128_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/libraft/conda_build_config.yaml
@@ -17,7 +17,7 @@ c_stdlib_version:
- "2.28"

cmake_version:
- ">=3.26.4,!=3.30.0"
- ">=3.30.4"

# The CTK libraries below are missing from the conda-forge::cudatoolkit package
# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages
2 changes: 1 addition & 1 deletion conda/recipes/pylibraft/conda_build_config.yaml
@@ -17,4 +17,4 @@ c_stdlib_version:
- "2.28"

cmake_version:
- ">=3.26.4,!=3.30.0"
- ">=3.30.4"
2 changes: 1 addition & 1 deletion conda/recipes/raft-dask/conda_build_config.yaml
@@ -23,7 +23,7 @@ ucxx_version:
- "0.43.*"

cmake_version:
- ">=3.26.4,!=3.30.0"
- ">=3.30.4"

nccl_version:
- ">=2.19"
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -10,7 +10,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
include(../rapids_config.cmake)
include(rapids-cmake)
include(rapids-cpm)
8 changes: 4 additions & 4 deletions cpp/include/raft/cluster/detail/kmeans.cuh
@@ -43,6 +43,7 @@

#include <cuda.h>
#include <thrust/fill.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>

#include <algorithm>
@@ -443,13 +444,12 @@ void kmeans_fit_main(raft::resources const& handle,
params.batch_centroids,
workspace);

// Using TransformInputIteratorT to dereference an array of
// Using thrust::transform_iterator to dereference an array of
// raft::KeyValuePair and converting them to just return the Key to be used
// in reduce_rows_by_key prims
detail::KeyValueIndexOp<IndexT, DataT> conversion_op;
cub::TransformInputIterator<IndexT,
detail::KeyValueIndexOp<IndexT, DataT>,
raft::KeyValuePair<IndexT, DataT>*>
thrust::transform_iterator<detail::KeyValueIndexOp<IndexT, DataT>,
raft::KeyValuePair<IndexT, DataT>*>
itr(minClusterAndDistance.data_handle(), conversion_op);

update_centroids(handle,
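Note on the iterator migration in this file: cub::TransformInputIterator was swapped for thrust::transform_iterator, apparently tracking CCCL's deprecation of the CUB iterator wrappers. The template parameters move around in the process: CUB's form is TransformInputIterator<OutputT, FunctorT, InputIteratorT>, while Thrust's is transform_iterator<FunctorT, InputIteratorT>, with the value type deduced from the functor's call operator. A minimal sketch of the key-extraction pattern used above, with hypothetical stand-ins for raft::KeyValuePair and detail::KeyValueIndexOp (compile with nvcc):

    #include <thrust/iterator/transform_iterator.h>

    #include <cstdio>

    // Hypothetical stand-ins for raft::KeyValuePair and detail::KeyValueIndexOp.
    struct KeyValue {
      int key;
      float value;
    };
    struct KeyOp {
      __host__ __device__ int operator()(const KeyValue& kv) const { return kv.key; }
    };

    int main()
    {
      KeyValue pairs[] = {{7, 0.5f}, {2, 1.5f}, {9, 2.5f}};

      // Before: cub::TransformInputIterator<int, KeyOp, const KeyValue*> itr(pairs, KeyOp{});
      // After: the value type (int) is deduced from KeyOp's call operator.
      thrust::transform_iterator<KeyOp, const KeyValue*> itr(pairs, KeyOp{});

      for (int i = 0; i < 3; ++i) {
        std::printf("%d\n", itr[i]);  // dereferencing applies KeyOp: prints 7, 2, 9
      }
      return 0;
    }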
7 changes: 5 additions & 2 deletions cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -47,6 +47,7 @@
#include <rmm/resource_ref.hpp>

#include <thrust/gather.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>

#include <limits>
@@ -288,7 +289,8 @@ void calc_centers_and_sizes(const raft::resources& handle,
dataset, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
} else {
// todo(lsugy): use iterator from KV output of fusedL2NN
cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset, mapping_op);
thrust::transform_iterator<MappingOpT, const T*, thrust::use_default, MathT> mapping_itr(
dataset, mapping_op);
raft::linalg::reduce_rows_by_key(
mapping_itr, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
}
@@ -894,7 +896,8 @@ auto build_fine_clusters(const raft::resources& handle,
"Number of fine clusters must be non-zero for a non-empty mesocluster");
}

cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset_mptr, mapping_op);
thrust::transform_iterator<MappingOpT, const T*, thrust::use_default, MathT> mapping_itr(
dataset_mptr, mapping_op);
raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
if (params.metric == raft::distance::DistanceType::L2Expanded ||
params.metric == raft::distance::DistanceType::L2SqrtExpanded) {
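In this file the element type (T) and the math type (MathT) differ, so the replacement iterator names its value type explicitly: thrust::transform_iterator takes optional Reference and Value template parameters after the functor and iterator types, and thrust::use_default keeps the reference type deduced while MathT fixes the value type. A sketch under the assumption of an int8_t-to-float mapping functor:

    #include <thrust/iterator/transform_iterator.h>

    #include <cstdint>

    // Hypothetical mapping functor: widens int8_t to float.
    struct ToFloat {
      __host__ __device__ float operator()(std::int8_t x) const { return x; }
    };

    void example(const std::int8_t* dataset)
    {
      // Reference stays defaulted (thrust::use_default); the value type (float)
      // is supplied as the fourth template argument, mirroring MathT above.
      thrust::transform_iterator<ToFloat, const std::int8_t*, thrust::use_default, float> itr(
        dataset, ToFloat{});
      float first = *itr;  // applies ToFloat to dataset[0]
      (void)first;
    }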
12 changes: 6 additions & 6 deletions cpp/include/raft/cluster/detail/kmeans_common.cuh
@@ -43,6 +43,7 @@
#include <cuda.h>
#include <thrust/fill.h>
#include <thrust/for_each.h>
#include <thrust/iterator/transform_iterator.h>

#include <algorithm>
#include <cmath>
@@ -199,8 +200,8 @@ void computeClusterCost(raft::resources const& handle,
{
cudaStream_t stream = resource::get_cuda_stream(handle);

cub::TransformInputIterator<OutputT, MainOpT, InputT*> itr(minClusterDistance.data_handle(),
main_op);
thrust::transform_iterator<MainOpT, InputT*, thrust::use_default, OutputT> itr(
minClusterDistance.data_handle(), main_op);

size_t temp_storage_bytes = 0;
RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(nullptr,
@@ -641,13 +642,12 @@ void countSamplesInCluster(raft::resources const& handle,
params.batch_centroids,
workspace);

// Using TransformInputIteratorT to dereference an array of raft::KeyValuePair
// Using thrust::transform_iterator to dereference an array of raft::KeyValuePair
// and converting them to just return the Key to be used in reduce_rows_by_key
// prims
detail::KeyValueIndexOp<IndexT, DataT> conversion_op;
cub::TransformInputIterator<IndexT,
detail::KeyValueIndexOp<IndexT, DataT>,
raft::KeyValuePair<IndexT, DataT>*>
thrust::transform_iterator<detail::KeyValueIndexOp<IndexT, DataT>,
raft::KeyValuePair<IndexT, DataT>*>
itr(minClusterAndDistance.data_handle(), conversion_op);

// count # of samples in each cluster
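Worth noting in computeClusterCost above: the thrust::transform_iterator is handed straight to cub::DeviceReduce::Reduce, which accepts any random-access input iterator, so the transform is fused into the reduction with no temporary buffer. A minimal sketch of that composition (hypothetical functor; CUDA error checking omitted, assumes CUDA 11.2+ for the async allocator):

    #include <cub/device/device_reduce.cuh>
    #include <thrust/iterator/transform_iterator.h>

    #include <cuda_runtime.h>

    struct Square {
      __host__ __device__ float operator()(float x) const { return x * x; }
    };

    // Sums the squares of d_in[0..n) into *d_out, applying Square on the fly
    // through the iterator.
    void sum_of_squares(const float* d_in, float* d_out, int n, cudaStream_t stream)
    {
      thrust::transform_iterator<Square, const float*> itr(d_in, Square{});

      // First call sizes the temporary storage; second call runs the reduction.
      size_t temp_bytes = 0;
      cub::DeviceReduce::Sum(nullptr, temp_bytes, itr, d_out, n, stream);
      void* d_temp = nullptr;
      cudaMallocAsync(&d_temp, temp_bytes, stream);
      cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, n, stream);
      cudaFreeAsync(d_temp, stream);
    }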
2 changes: 1 addition & 1 deletion cpp/include/raft/core/sparse_types.hpp
@@ -168,7 +168,7 @@ class sparse_matrix {
row_type n_rows,
col_type n_cols,
nnz_type nnz = 0) noexcept(std::is_nothrow_default_constructible_v<container_type>)
: structure_{handle, n_rows, n_cols, nnz}, cp_{}, c_elements_{cp_.create(handle, 0)} {};
: structure_{handle, n_rows, n_cols, nnz}, cp_{}, c_elements_{cp_.create(handle, nnz)} {};

// Constructor that owns the data but not the structure
// This constructor is only callable with a `structure_type == *_structure_view`
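The one-character change above is a genuine bug fix: the owning constructor sized the element buffer with a hard-coded 0 instead of the requested nnz, leaving a matrix constructed with a non-zero nnz without backing storage for its values. A simplified before/after sketch (hypothetical container standing in for the raft container policy):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct sparse_matrix_sketch {
      std::vector<float> elements;
      // Before the fix: elements(0) -- empty regardless of the requested nnz.
      // After the fix:  elements(nnz).
      explicit sparse_matrix_sketch(std::size_t nnz) : elements(nnz) {}
    };

    int main()
    {
      sparse_matrix_sketch m(12);       // request storage for 12 non-zeros
      assert(m.elements.size() == 12);  // holds only with the fixed constructor
      m.elements[11] = 1.0f;            // in bounds once nnz elements exist
      return 0;
    }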
24 changes: 12 additions & 12 deletions cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -35,19 +35,19 @@ namespace detail {
// Note that the compensation will only be performed 'per-block' for performance
// reasons and therefore not be equivalent to a sequential compensation.

template <typename Type, typename MainLambda>
template <typename Type, typename IdxType, typename MainLambda>
RAFT_KERNEL stridedSummationKernel(
Type* out, const Type* data, int D, int N, Type init, MainLambda main_op)
Type* out, const Type* data, IdxType D, IdxType N, Type init, MainLambda main_op)
{
// Thread reduction
Type thread_sum = Type(init);
Type thread_c = Type(0);
int colStart = blockIdx.x * blockDim.x + threadIdx.x;
Type thread_sum = Type(init);
Type thread_c = Type(0);
IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
if (colStart < D) {
int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
int stride = blockDim.y * gridDim.y;
for (int j = rowStart; j < N; j += stride) {
int idx = colStart + j * D;
IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
IdxType stride = blockDim.y * gridDim.y;
for (IdxType j = rowStart; j < N; j += stride) {
auto idx = colStart + j * D;

// KahanBabushkaNeumaierSum
const Type cur_value = main_op(data[idx], j);
@@ -97,8 +97,8 @@ template <typename InType,
typename ReduceLambda>
RAFT_KERNEL stridedReductionKernel(OutType* dots,
const InType* data,
int D,
int N,
IdxType D,
IdxType N,
OutType init,
MainLambda main_op,
ReduceLambda reduce_op)
@@ -167,7 +167,7 @@ void stridedReduction(OutType* dots,
raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk)));
const size_t shmemSize = sizeof(OutType) * Block.x * 2;

stridedSummationKernel<InType>
stridedSummationKernel<InType, IdxType>
<<<grid, Block, shmemSize, stream>>>(dots, data, D, N, init, main_op);
} else {
// Arbitrary numbers for now, probably need to tune
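The kernel changes above are about index width rather than arithmetic: stridedSummationKernel previously computed colStart, the row loop, and the flat offset idx = colStart + j * D in int, which overflows once the input holds more than about 2^31 elements. Templating on IdxType (instantiated as stridedSummationKernel<InType, IdxType> at the launch site) lets callers use 64-bit extents. A small host-side illustration of the sizes involved:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      using IdxType = std::int64_t;  // what the templated kernel now permits
      // Hypothetical shape: 3,000,000 rows x 1,024 columns.
      IdxType D = 1024, N = 3000000;
      // Mirrors `idx = colStart + j * D` for the last column of the last row.
      IdxType last = (D - 1) + (N - 1) * D;
      std::printf("last flat index = %lld (INT_MAX = 2147483647)\n",
                  static_cast<long long>(last));  // 3071999999 does not fit in int
      return 0;
    }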
5 changes: 3 additions & 2 deletions cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -53,6 +53,7 @@

#include <cuda_fp16.h>
#include <thrust/extrema.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>

#include <memory>
@@ -180,8 +181,8 @@ void select_residuals(raft::resources const& handle,
rmm::device_uvector<float> tmp(size_t(n_rows) * size_t(dim), stream, device_memory);
// Note: the number of rows of the input dataset isn't actually n_rows, but matrix::gather doesn't
// need to know it, any strictly positive number would work.
cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(
dataset, utils::mapping<float>{});
thrust::transform_iterator<utils::mapping<float>, const T*> mapping_itr(dataset,
utils::mapping<float>{});
raft::matrix::gather(mapping_itr, (IdxT)dim, n_rows, row_ids, n_rows, tmp.data(), stream);

raft::matrix::linewise_op(handle,
…
