Skip to content

Commit

Permalink
Fix the bug in determining the heuristics for shared memory groupby (#17851)
Browse files Browse the repository at this point in the history

Fixes #17853

This PR fixes a bug in determining the heuristics for shared memory groupby, ensuring that the implementation selects the shared memory code path only when there is enough shared memory available to hold temporary aggregation results for at least one column.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: #17851
  • Loading branch information
PointKernel authored Jan 30, 2025
1 parent 802d3e0 commit 847fa28
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
18 changes: 11 additions & 7 deletions cpp/src/groupby/hash/compute_aggregations.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,13 +67,17 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
auto const grid_size =
max_occupancy_grid_size<typename SetType::ref_type<cuco::insert_and_find_tag>>(num_rows);
auto const available_shmem_size = get_available_shared_memory_size(grid_size);
auto const has_sufficient_shmem =
available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
auto const has_dictionary_request = std::any_of(
requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
return cudf::is_dictionary(request.values.type());
auto const offsets_buffer_size = compute_shmem_offsets_size(flattened_values.num_columns()) * 2;
auto const data_buffer_size = available_shmem_size - offsets_buffer_size;
auto const is_shared_memory_compatible = std::all_of(
requests.begin(), requests.end(), [&](cudf::groupby::aggregation_request const& request) {
if (cudf::is_dictionary(request.values.type())) { return false; }
// Ensure there is enough buffer space to store local aggregations up to the max cardinality
// for shared memory aggregations
auto const size = cudf::type_dispatcher<cudf::dispatch_storage_type>(request.values.type(),
size_of_functor{});
return static_cast<size_type>(data_buffer_size) >= (size * GROUPBY_CARDINALITY_THRESHOLD);
});
auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;

// Performs naive global memory aggregations when the workload is not compatible with shared
// memory, such as when aggregating dictionary columns or when there is insufficient dynamic
Expand Down
11 changes: 1 addition & 10 deletions cpp/src/groupby/hash/compute_shared_memory_aggs.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,15 +35,6 @@

namespace cudf::groupby::detail::hash {
namespace {
/// Functor used by type dispatcher returning the size of the underlying C++ type
/// NOTE(review): `__device__`-only — callable solely from device code, which is why
/// this definition lives in the .cu translation unit; host-side heuristics cannot use it.
struct size_of_functor {
  template <typename T>
  __device__ constexpr cudf::size_type operator()()
  {
    // sizeof(T) is a compile-time constant; dispatched via cudf::type_dispatcher
    // so the runtime type id is mapped to its storage size.
    return sizeof(T);
  }
};

/// Shared memory data alignment
CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;

Expand Down
11 changes: 10 additions & 1 deletion cpp/src/groupby/hash/single_pass_functors.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,15 @@
#include <cuda/std/cstddef>

namespace cudf::groupby::detail::hash {
/// Functor used by type dispatcher returning the size of the underlying C++ type
///
/// Callable from both host and device code so that the shared-memory heuristic
/// (host side) and the aggregation kernels (device side) can share it.
struct size_of_functor {
  template <typename T>
  CUDF_HOST_DEVICE constexpr cudf::size_type operator()()
  {
    // sizeof yields std::size_t; narrow explicitly to cudf's signed size type.
    return static_cast<cudf::size_type>(sizeof(T));
  }
};

// TODO: TO BE REMOVED issue tracked via #17171
template <typename T, cudf::aggregation::Kind k>
__device__ constexpr bool is_supported()
Expand Down

0 comments on commit 847fa28

Please sign in to comment.