Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into cln/column_empty/masked
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Dec 13, 2024
2 parents d183bea + 7749702 commit f792abe
Show file tree
Hide file tree
Showing 49 changed files with 1,732 additions and 2,325 deletions.
19 changes: 8 additions & 11 deletions cpp/benchmarks/string/case.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,14 @@

void bench_case(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const max_width = static_cast<int32_t>(state.get_int64("row_width"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const encoding = state.get_string("encoding");

if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);

auto col_view = column->view();

Expand Down Expand Up @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state)

NVBENCH_BENCH(bench_case)
.set_name("case")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("encoding", {"ascii", "utf8"});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/char_types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,12 @@
static void bench_char_types(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const api_type = state.get_string("api");

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
Expand All @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state)

NVBENCH_BENCH(bench_char_types)
.set_name("char_types")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("api", {"all", "filter"});
13 changes: 4 additions & 9 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"

static void bench_contains(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));
auto const hit_rate = static_cast<cudf::size_type>(state.get_int64("hit_rate"));

if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

auto col = create_string_column(n_rows, row_width, hit_rate);
auto col = create_string_column(num_rows, row_width, hit_rate);
auto input = cudf::strings_column_view(col->view());

auto pattern = patterns[pattern_index];
Expand All @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state)

NVBENCH_BENCH(bench_contains)
.set_name("contains")
.add_int64_axis("row_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("row_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("hit_rate", {50, 100}) // percentage
.add_int64_axis("pattern", {0, 1, 2});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/copy_if_else.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,11 @@
static void bench_copy(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

data_profile const str_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const source_table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile);
auto const target_table =
Expand All @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state)

NVBENCH_BENCH(bench_copy)
.set_name("copy_if_else")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/copy_range.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,12 @@
static void bench_copy_range(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
.no_validity();
auto const source_tables = create_random_table(
{cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);
Expand All @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state)

NVBENCH_BENCH(bench_copy_range)
.set_name("copy_range")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"};
static void bench_count(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
Expand All @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state)

NVBENCH_BENCH(bench_count)
.set_name("count")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("pattern", {0, 1});
9 changes: 2 additions & 7 deletions cpp/benchmarks/string/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state)
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

auto groups = static_cast<cudf::size_type>(state.get_int64("groups"));

std::default_random_engine generator;
Expand Down Expand Up @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state)

NVBENCH_BENCH(bench_extract)
.set_name("extract")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("row_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("groups", {1, 2, 4});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/join_strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,11 @@
static void bench_join(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
Expand All @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state)

NVBENCH_BENCH(bench_join)
.set_name("strings_join")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/lengths.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,11 @@
static void bench_lengths(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
Expand All @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state)

NVBENCH_BENCH(bench_lengths)
.set_name("lengths")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152});
9 changes: 2 additions & 7 deletions cpp/benchmarks/string/like.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state)
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const hit_rate = static_cast<int32_t>(state.get_int64("hit_rate"));

if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

auto col = create_string_column(n_rows, row_width, hit_rate);
auto input = cudf::strings_column_view(col->view());

Expand All @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state)

NVBENCH_BENCH(bench_like)
.set_name("strings_like")
.add_int64_axis("row_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("row_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("hit_rate", {10, 25, 70, 100});
19 changes: 8 additions & 11 deletions cpp/benchmarks/string/replace_re.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,14 @@

static void bench_replace(nvbench::state& state)
{
auto const n_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const rtype = state.get_string("type");

if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());

auto program = cudf::strings::regex_program::create("(\\d+)");
Expand All @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state)

NVBENCH_BENCH(bench_replace)
.set_name("replace_re")
.add_int64_axis("row_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("type", {"replace", "backref"});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/reverse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,11 @@
static void bench_reverse(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const table =
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));
Expand All @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state)

NVBENCH_BENCH(bench_reverse)
.set_name("reverse")
.add_int64_axis("row_width", {8, 16, 32, 64, 128})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152});
9 changes: 2 additions & 7 deletions cpp/benchmarks/string/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state)
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const stype = state.get_string("type");

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
Expand Down Expand Up @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state)

NVBENCH_BENCH(bench_slice)
.set_name("slice")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {262144, 2097152, 16777216})
.add_int64_axis("row_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("type", {"position", "multi"});
15 changes: 6 additions & 9 deletions cpp/benchmarks/string/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,12 @@
static void bench_split(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
auto const stype = state.get_string("type");

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
cudf::strings_column_view input(column->view());
cudf::string_scalar target("+");
Expand Down Expand Up @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state)

NVBENCH_BENCH(bench_split)
.set_name("split")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("min_width", {0})
.add_int64_axis("max_width", {32, 64, 128, 256})
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_string_axis("type", {"split", "split_ws", "record", "record_ws"});
Loading

0 comments on commit f792abe

Please sign in to comment.