Skip to content

Commit

Permalink
Fix the bug in determining the heuristics for shared memory groupby (#17851)
Browse files Browse the repository at this point in the history

Fixes #17853

This PR fixes a bug in determining the heuristics for shared memory groupby, ensuring that the implementation selects the shared memory code path only when there is enough shared memory available to hold temporary aggregation results for at least one column.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: #17851
  • Loading branch information
PointKernel authored Jan 30, 2025
1 parent 802d3e0 commit 847fa28
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
18 changes: 11 additions & 7 deletions cpp/src/groupby/hash/compute_aggregations.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,13 +67,17 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
auto const grid_size =
max_occupancy_grid_size<typename SetType::ref_type<cuco::insert_and_find_tag>>(num_rows);
auto const available_shmem_size = get_available_shared_memory_size(grid_size);
auto const has_sufficient_shmem =
available_shmem_size > (compute_shmem_offsets_size(flattened_values.num_columns()) * 2);
auto const has_dictionary_request = std::any_of(
requests.begin(), requests.end(), [](cudf::groupby::aggregation_request const& request) {
return cudf::is_dictionary(request.values.type());
auto const offsets_buffer_size = compute_shmem_offsets_size(flattened_values.num_columns()) * 2;
auto const data_buffer_size = available_shmem_size - offsets_buffer_size;
auto const is_shared_memory_compatible = std::all_of(
requests.begin(), requests.end(), [&](cudf::groupby::aggregation_request const& request) {
if (cudf::is_dictionary(request.values.type())) { return false; }
// Ensure there is enough buffer space to store local aggregations up to the max cardinality
// for shared memory aggregations
auto const size = cudf::type_dispatcher<cudf::dispatch_storage_type>(request.values.type(),
size_of_functor{});
return static_cast<size_type>(data_buffer_size) >= (size * GROUPBY_CARDINALITY_THRESHOLD);
});
auto const is_shared_memory_compatible = !has_dictionary_request and has_sufficient_shmem;

// Performs naive global memory aggregations when the workload is not compatible with shared
// memory, such as when aggregating dictionary columns or when there is insufficient dynamic
Expand Down
11 changes: 1 addition & 10 deletions cpp/src/groupby/hash/compute_shared_memory_aggs.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,15 +35,6 @@

namespace cudf::groupby::detail::hash {
namespace {
/// Functor used by type dispatcher returning the size of the underlying C++ type
/// NOTE(review): `__device__`-only — callable solely from device code, which is why
/// this definition lives in the .cu translation unit; host-side heuristics cannot use it.
struct size_of_functor {
  template <typename T>
  __device__ constexpr cudf::size_type operator()()
  {
    // sizeof(T) is a compile-time constant; dispatched via cudf::type_dispatcher
    // so the runtime type id is mapped to its storage size.
    return sizeof(T);
  }
};

/// Shared memory data alignment
CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;

Expand Down
11 changes: 10 additions & 1 deletion cpp/src/groupby/hash/single_pass_functors.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,15 @@
#include <cuda/std/cstddef>

namespace cudf::groupby::detail::hash {
/// Functor used by type dispatcher returning the size of the underlying C++ type
///
/// Callable from both host and device code so that the shared-memory heuristic
/// (host side) and the aggregation kernels (device side) can share it.
struct size_of_functor {
  template <typename T>
  CUDF_HOST_DEVICE constexpr cudf::size_type operator()()
  {
    // sizeof yields std::size_t; narrow explicitly to cudf's signed size type.
    return static_cast<cudf::size_type>(sizeof(T));
  }
};

// TODO: TO BE REMOVED issue tracked via #17171
template <typename T, cudf::aggregation::Kind k>
__device__ constexpr bool is_supported()
Expand Down

0 comments on commit 847fa28

Please sign in to comment.