From 856c985e6acc28d3e0550e33698c4fc3882dc008 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 9 Jan 2024 14:15:28 -0800 Subject: [PATCH 01/23] Split parquet test into multiple files (#14663) Compiling the parquet C++ tests takes a long time. This work attempts to address this by splitting the (mostly) monolithic `parquet_test.cpp` into multiple compilation units. On my workstation (10 core i7) I was able to reduce the compile time for PARQUET_TEST from 48s down to 23s. This PR also splits the testing `base_fixture.hpp` file into three parts to cut down on unnecessary rmm includes. This change resulted in a time savings of around 20% when compiling the entire test suite locally. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14663 --- cpp/include/cudf_test/base_fixture.hpp | 308 +- cpp/include/cudf_test/random.hpp | 173 + cpp/include/cudf_test/testing_main.hpp | 178 + cpp/tests/CMakeLists.txt | 10 +- cpp/tests/ast/transform_tests.cpp | 4 +- cpp/tests/binaryop/binop-compiled-test.cpp | 3 +- cpp/tests/binaryop/binop-fixture.hpp | 3 +- cpp/tests/bitmask/bitmask_tests.cpp | 4 +- cpp/tests/column/column_test.cpp | 3 +- cpp/tests/copying/concatenate_tests.cpp | 3 +- cpp/tests/copying/gather_tests.cpp | 3 +- cpp/tests/copying/utility_tests.cpp | 3 +- cpp/tests/datetime/datetime_ops_test.cpp | 3 +- .../device_atomics/device_atomics_test.cu | 4 +- cpp/tests/dictionary/add_keys_test.cpp | 3 +- cpp/tests/encode/encode_tests.cpp | 3 +- cpp/tests/error/error_handling_test.cu | 3 +- cpp/tests/filling/fill_tests.cpp | 3 +- cpp/tests/filling/repeat_tests.cpp | 3 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 3 +- cpp/tests/groupby/argmax_tests.cpp | 3 +- cpp/tests/hash_map/map_test.cu | 3 +- cpp/tests/hashing/murmurhash3_x86_32_test.cpp | 3 +- cpp/tests/interop/to_arrow_test.cpp 
| 3 +- cpp/tests/io/arrow_io_source_test.cpp | 3 +- cpp/tests/io/comp/decomp_test.cpp | 1 + cpp/tests/io/csv_test.cpp | 4 +- cpp/tests/io/file_io_test.cpp | 3 +- cpp/tests/io/fst/fst_test.cu | 3 +- cpp/tests/io/fst/logical_stack_test.cu | 3 +- cpp/tests/io/fst/quote_normalization_test.cu | 1 + cpp/tests/io/json_test.cpp | 4 +- cpp/tests/io/json_tree.cpp | 4 +- cpp/tests/io/json_type_cast_test.cu | 3 +- cpp/tests/io/json_writer.cpp | 3 +- cpp/tests/io/nested_json_test.cpp | 4 +- cpp/tests/io/orc_test.cpp | 2 + cpp/tests/io/parquet_chunked_writer_test.cpp | 855 ++ cpp/tests/io/parquet_common.cpp | 798 ++ cpp/tests/io/parquet_common.hpp | 264 + cpp/tests/io/parquet_misc_test.cpp | 235 + cpp/tests/io/parquet_reader_test.cpp | 2340 ++++++ cpp/tests/io/parquet_test.cpp | 7292 +---------------- cpp/tests/io/parquet_v2_test.cpp | 1528 ++++ cpp/tests/io/parquet_writer_test.cpp | 1720 ++++ cpp/tests/io/row_selection_test.cpp | 3 +- cpp/tests/io/text/data_chunk_source_test.cpp | 3 +- cpp/tests/io/text/multibyte_split_test.cpp | 3 +- cpp/tests/io/type_inference_test.cu | 3 +- .../optional_iterator_test_numeric.cu | 4 +- .../iterator/pair_iterator_test_numeric.cu | 4 +- cpp/tests/iterator/scalar_iterator_test.cu | 4 +- cpp/tests/iterator/value_iterator.cpp | 3 +- .../iterator/value_iterator_test_transform.cu | 4 +- cpp/tests/jit/parse_ptx_function.cpp | 3 +- cpp/tests/join/join_tests.cpp | 3 +- cpp/tests/labeling/label_bins_tests.cpp | 3 +- cpp/tests/lists/extract_tests.cpp | 3 +- cpp/tests/merge/merge_test.cpp | 3 +- .../partitioning/hash_partition_test.cpp | 3 +- cpp/tests/quantiles/quantile_test.cpp | 3 +- cpp/tests/reductions/reduction_tests.cpp | 3 +- cpp/tests/replace/clamp_test.cpp | 3 +- cpp/tests/replace/normalize_replace_tests.cpp | 3 +- cpp/tests/replace/replace_nans_tests.cpp | 3 +- cpp/tests/replace/replace_nulls_tests.cpp | 3 +- cpp/tests/replace/replace_tests.cpp | 3 +- .../reshape/interleave_columns_tests.cpp | 3 +- cpp/tests/rolling/rolling_test.cpp | 4 +- 
cpp/tests/round/round_tests.cpp | 3 +- cpp/tests/scalar/factories_test.cpp | 3 +- cpp/tests/scalar/scalar_test.cpp | 3 +- cpp/tests/search/search_test.cpp | 3 +- cpp/tests/sort/is_sorted_tests.cpp | 3 +- cpp/tests/sort/sort_test.cpp | 3 +- .../apply_boolean_mask_tests.cpp | 4 +- cpp/tests/strings/array_tests.cpp | 3 +- cpp/tests/structs/structs_column_tests.cpp | 3 +- cpp/tests/table/table_tests.cpp | 3 +- cpp/tests/text/ngrams_tests.cpp | 3 +- cpp/tests/transform/bools_to_mask_test.cpp | 3 +- .../integration/unary_transform_test.cpp | 3 +- cpp/tests/transform/row_conversion.cpp | 3 +- cpp/tests/transpose/transpose_test.cpp | 3 +- cpp/tests/types/traits_test.cpp | 3 +- cpp/tests/types/type_dispatcher_test.cu | 3 +- cpp/tests/unary/unary_ops_test.cpp | 3 +- .../column_utilities_tests.cpp | 4 +- .../utilities_tests/column_wrapper_tests.cpp | 3 +- cpp/tests/utilities_tests/span_tests.cu | 3 +- cpp/tests/wrappers/timestamps_test.cu | 3 +- 91 files changed, 8282 insertions(+), 7665 deletions(-) create mode 100644 cpp/include/cudf_test/random.hpp create mode 100644 cpp/include/cudf_test/testing_main.hpp create mode 100644 cpp/tests/io/parquet_chunked_writer_test.cpp create mode 100644 cpp/tests/io/parquet_common.cpp create mode 100644 cpp/tests/io/parquet_common.hpp create mode 100644 cpp/tests/io/parquet_misc_test.cpp create mode 100644 cpp/tests/io/parquet_reader_test.cpp create mode 100644 cpp/tests/io/parquet_v2_test.cpp create mode 100644 cpp/tests/io/parquet_writer_test.cpp diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 06aabbe4e9c..14b94e061ae 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,29 +16,17 @@ #pragma once -#include - -#include -#include -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +#include #include -#include namespace cudf { namespace test { + /** * @brief Base test fixture class from which all libcudf tests should inherit. * @@ -80,152 +68,6 @@ class BaseFixtureWithParam : public ::testing::TestWithParam { rmm::mr::device_memory_resource* mr() const { return _mr; } }; -template -struct uniform_distribution_impl {}; -template -struct uniform_distribution_impl>> { - using type = std::uniform_int_distribution; -}; - -template <> -struct uniform_distribution_impl { - using type = std::bernoulli_distribution; -}; - -template -struct uniform_distribution_impl>> { - using type = std::uniform_real_distribution; -}; - -template -struct uniform_distribution_impl< - T, - std::enable_if_t() or cudf::is_fixed_point()>> { - using type = std::uniform_int_distribution; -}; - -template -using uniform_distribution_t = typename uniform_distribution_impl::type; - -namespace detail { - -/** - * @brief Returns an incrementing seed value for use with UniformRandomGenerator. - * - * The intent behind this is to handle the following case: - * - * auto lhs = make_random_wrapped_column(10000); - * auto rhs = make_random_wrapped_column(10000); - * - * Previously, the binops test framework had a persistent UniformRandomGenerator - * that would produce unique values across two calls to make_random_wrapped_column() - * like this. However that code has been changed and each call to make_random_wrapped_column() - * now uses a local UniformRandomGenerator object. If we didn't generate an incrementing seed - * for each one, every call to make_random_wrapped_column() would return the same values. This - * fixes that case and also leaves results across multiple test runs deterministic. 
- */ -uint64_t random_generator_incrementing_seed(); - -} // namespace detail - -/** - * @brief Provides uniform random number generation. - * - * It is often useful in testing to have a convenient source of random numbers. - * This class is intended to serve as a base class for test fixtures to provide - * random number generation. `UniformRandomGenerator::generate()` will generate - * the next random number in the sequence. - * - * Example: - * ```c++ - * UniformRandomGenerator g(0,100); - * g.generate(); // Returns a random number in the range [0,100] - * ``` - * - * @tparam T The type of values that will be generated. - */ -template -class UniformRandomGenerator { - public: - using uniform_distribution = uniform_distribution_t; ///< The uniform distribution type for T. - - UniformRandomGenerator() : rng{std::mt19937_64{detail::random_generator_incrementing_seed()}()} {} - - /** - * @brief Construct a new Uniform Random Generator to generate uniformly - * random numbers in the range `[upper,lower]` - * - * @param lower Lower bound of the range - * @param upper Upper bound of the desired range - * @param seed seed to initialize generator with - */ - template () && !cudf::is_boolean()>* = nullptr> - UniformRandomGenerator(T lower, - T upper, - uint64_t seed = detail::random_generator_incrementing_seed()) - : dist{lower, upper}, rng{std::mt19937_64{seed}()} - { - } - - /** - * @brief Construct a new Uniform Random Generator to generate uniformly random booleans - * - * @param lower ignored - * @param upper ignored - * @param seed seed to initialize generator with - */ - template ()>* = nullptr> - UniformRandomGenerator(T lower, - T upper, - uint64_t seed = detail::random_generator_incrementing_seed()) - : dist{0.5}, rng{std::mt19937_64{seed}()} - { - } - - /** - * @brief Construct a new Uniform Random Generator to generate uniformly - * random numbers in the range `[upper,lower]` - * - * @param lower Lower bound of the range - * @param upper Upper bound of the 
desired range - * @param seed seed to initialize generator with - */ - template () or cudf::is_fixed_point()>* = nullptr> - UniformRandomGenerator(typename TL::rep lower, - typename TL::rep upper, - uint64_t seed = detail::random_generator_incrementing_seed()) - : dist{lower, upper}, rng{std::mt19937_64{seed}()} - { - } - - /** - * @brief Returns the next random number. - * - * @return generated random number - */ - template ()>* = nullptr> - T generate() - { - return T{dist(rng)}; - } - - /** - * @brief Returns the next random number. - * @return generated random number - */ - template ()>* = nullptr> - T generate() - { - return T{typename T::duration{dist(rng)}}; - } - - private: - uniform_distribution dist{}; ///< Distribution - Engine rng; ///< Random generator -}; - /** * @brief Provides temporary directory for temporary test files. * @@ -255,145 +97,5 @@ class TempDirTestEnvironment : public ::testing::Environment { std::string get_temp_filepath(std::string filename) { return tmpdir.path() + filename; } }; -/// MR factory functions -inline auto make_cuda() { return std::make_shared(); } - -inline auto make_async() { return std::make_shared(); } - -inline auto make_managed() { return std::make_shared(); } - -inline auto make_pool() -{ - auto const [free, total] = rmm::detail::available_device_memory(); - auto min_alloc = - rmm::detail::align_down(std::min(free, total / 10), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); - return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); -} - -inline auto make_arena() -{ - return rmm::mr::make_owning_wrapper(make_cuda()); -} - -inline auto make_binning() -{ - auto pool = make_pool(); - // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB - // Larger allocations will use the pool resource - auto mr = rmm::mr::make_owning_wrapper(pool, 18, 22); - return mr; -} - -/** - * @brief Creates a memory resource for the unit test environment - * given the name of the allocation mode. 
- * - * The returned resource instance must be kept alive for the duration of - * the tests. Attaching the resource to a TestEnvironment causes - * issues since the environment objects are not destroyed until - * after the runtime is shutdown. - * - * @throw cudf::logic_error if the `allocation_mode` is unsupported. - * - * @param allocation_mode String identifies which resource type. - * Accepted types are "pool", "cuda", and "managed" only. - * @return Memory resource instance - */ -inline std::shared_ptr create_memory_resource( - std::string const& allocation_mode) -{ - if (allocation_mode == "binning") return make_binning(); - if (allocation_mode == "cuda") return make_cuda(); - if (allocation_mode == "async") return make_async(); - if (allocation_mode == "pool") return make_pool(); - if (allocation_mode == "arena") return make_arena(); - if (allocation_mode == "managed") return make_managed(); - CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode); -} - } // namespace test } // namespace cudf - -/** - * @brief Parses the cuDF test command line options. - * - * Currently only supports 'rmm_mode' string parameter, which set the rmm - * allocation mode. The default value of the parameter is 'pool'. - * Environment variable 'CUDF_TEST_RMM_MODE' can also be used to set the rmm - * allocation mode. If both are set, the value of 'rmm_mode' string parameter - * takes precedence. - * - * @return Parsing results in the form of unordered map - */ -inline auto parse_cudf_test_opts(int argc, char** argv) -{ - try { - cxxopts::Options options(argv[0], " - cuDF tests command line options"); - char const* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options - char const* env_stream_mode = - std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options - char const* env_stream_error_mode = - std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options - auto default_rmm_mode = env_rmm_mode ? 
env_rmm_mode : "pool"; - auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; - auto default_stream_error_mode = env_stream_error_mode ? env_stream_error_mode : "error"; - options.allow_unrecognised_options().add_options()( - "rmm_mode", - "RMM allocation mode", - cxxopts::value()->default_value(default_rmm_mode)); - // `new_cudf_default` means that cudf::get_default_stream has been patched, - // so we raise errors anywhere that a CUDA default stream is observed - // instead of cudf::get_default_stream(). This corresponds to compiling - // identify_stream_usage with STREAM_MODE_TESTING=OFF (must do both at the - // same time). - // `new_testing_default` means that cudf::test::get_default_stream has been - // patched, so we raise errors anywhere that _any_ other stream is - // observed. This corresponds to compiling identify_stream_usage with - // STREAM_MODE_TESTING=ON (must do both at the same time). - options.allow_unrecognised_options().add_options()( - "stream_mode", - "Whether to use a non-default stream", - cxxopts::value()->default_value(default_stream_mode)); - options.allow_unrecognised_options().add_options()( - "stream_error_mode", - "Whether to error or print to stdout when a non-default stream is observed and stream_mode " - "is not \"default\"", - cxxopts::value()->default_value(default_stream_error_mode)); - return options.parse(argc, argv); - } catch (cxxopts::OptionException const& e) { - CUDF_FAIL("Error parsing command line options"); - } -} - -/** - * @brief Macro that defines main function for gtest programs that use rmm - * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. - * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. 
In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. - */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - \ - auto const stream_mode = cmd_opts["stream_mode"].as(); \ - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ - auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); \ - auto const error_on_invalid_stream = (stream_error_mode == "error"); \ - auto const check_default_stream = (stream_mode == "new_cudf_default"); \ - auto adaptor = make_stream_checking_resource_adaptor( \ - resource.get(), error_on_invalid_stream, check_default_stream); \ - rmm::mr::set_current_device_resource(&adaptor); \ - return RUN_ALL_TESTS(); \ - } \ - \ - return RUN_ALL_TESTS(); \ - } diff --git a/cpp/include/cudf_test/random.hpp b/cpp/include/cudf_test/random.hpp new file mode 100644 index 00000000000..498bacc81c9 --- /dev/null +++ b/cpp/include/cudf_test/random.hpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace test { + +template +struct uniform_distribution_impl {}; +template +struct uniform_distribution_impl>> { + using type = std::uniform_int_distribution; +}; + +template <> +struct uniform_distribution_impl { + using type = std::bernoulli_distribution; +}; + +template +struct uniform_distribution_impl>> { + using type = std::uniform_real_distribution; +}; + +template +struct uniform_distribution_impl< + T, + std::enable_if_t() or cudf::is_fixed_point()>> { + using type = std::uniform_int_distribution; +}; + +template +using uniform_distribution_t = typename uniform_distribution_impl::type; + +namespace detail { + +/** + * @brief Returns an incrementing seed value for use with UniformRandomGenerator. + * + * The intent behind this is to handle the following case: + * + * auto lhs = make_random_wrapped_column(10000); + * auto rhs = make_random_wrapped_column(10000); + * + * Previously, the binops test framework had a persistent UniformRandomGenerator + * that would produce unique values across two calls to make_random_wrapped_column() + * like this. However that code has been changed and each call to make_random_wrapped_column() + * now uses a local UniformRandomGenerator object. If we didn't generate an incrementing seed + * for each one, every call to make_random_wrapped_column() would return the same values. This + * fixes that case and also leaves results across multiple test runs deterministic. + */ +uint64_t random_generator_incrementing_seed(); + +} // namespace detail + +/** + * @brief Provides uniform random number generation. + * + * It is often useful in testing to have a convenient source of random numbers. + * This class is intended to serve as a base class for test fixtures to provide + * random number generation. `UniformRandomGenerator::generate()` will generate + * the next random number in the sequence. 
+ * + * Example: + * ```c++ + * UniformRandomGenerator g(0,100); + * g.generate(); // Returns a random number in the range [0,100] + * ``` + * + * @tparam T The type of values that will be generated. + */ +template +class UniformRandomGenerator { + public: + using uniform_distribution = uniform_distribution_t; ///< The uniform distribution type for T. + + UniformRandomGenerator() : rng{std::mt19937_64{detail::random_generator_incrementing_seed()}()} {} + + /** + * @brief Construct a new Uniform Random Generator to generate uniformly + * random numbers in the range `[upper,lower]` + * + * @param lower Lower bound of the range + * @param upper Upper bound of the desired range + * @param seed seed to initialize generator with + */ + template () && !cudf::is_boolean()>* = nullptr> + UniformRandomGenerator(T lower, + T upper, + uint64_t seed = detail::random_generator_incrementing_seed()) + : dist{lower, upper}, rng{std::mt19937_64{seed}()} + { + } + + /** + * @brief Construct a new Uniform Random Generator to generate uniformly random booleans + * + * @param lower ignored + * @param upper ignored + * @param seed seed to initialize generator with + */ + template ()>* = nullptr> + UniformRandomGenerator(T lower, + T upper, + uint64_t seed = detail::random_generator_incrementing_seed()) + : dist{0.5}, rng{std::mt19937_64{seed}()} + { + } + + /** + * @brief Construct a new Uniform Random Generator to generate uniformly + * random numbers in the range `[upper,lower]` + * + * @param lower Lower bound of the range + * @param upper Upper bound of the desired range + * @param seed seed to initialize generator with + */ + template () or cudf::is_fixed_point()>* = nullptr> + UniformRandomGenerator(typename TL::rep lower, + typename TL::rep upper, + uint64_t seed = detail::random_generator_incrementing_seed()) + : dist{lower, upper}, rng{std::mt19937_64{seed}()} + { + } + + /** + * @brief Returns the next random number. 
+ * + * @return generated random number + */ + template ()>* = nullptr> + T generate() + { + return T{dist(rng)}; + } + + /** + * @brief Returns the next random number. + * @return generated random number + */ + template ()>* = nullptr> + T generate() + { + return T{typename T::duration{dist(rng)}}; + } + + private: + uniform_distribution dist{}; ///< Distribution + Engine rng; ///< Random generator +}; + +} // namespace test +} // namespace cudf diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp new file mode 100644 index 00000000000..12dbb4c7851 --- /dev/null +++ b/cpp/include/cudf_test/testing_main.hpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace test { + +/// MR factory functions +inline auto make_cuda() { return std::make_shared(); } + +inline auto make_async() { return std::make_shared(); } + +inline auto make_managed() { return std::make_shared(); } + +inline auto make_pool() +{ + auto const [free, total] = rmm::detail::available_device_memory(); + auto min_alloc = + rmm::detail::align_down(std::min(free, total / 10), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); +} + +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_binning() +{ + auto pool = make_pool(); + // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB + // Larger allocations will use the pool resource + auto mr = rmm::mr::make_owning_wrapper(pool, 18, 22); + return mr; +} + +/** + * @brief Creates a memory resource for the unit test environment + * given the name of the allocation mode. + * + * The returned resource instance must be kept alive for the duration of + * the tests. Attaching the resource to a TestEnvironment causes + * issues since the environment objects are not destroyed until + * after the runtime is shutdown. + * + * @throw cudf::logic_error if the `allocation_mode` is unsupported. + * + * @param allocation_mode String identifies which resource type. + * Accepted types are "pool", "cuda", and "managed" only. 
+ * @return Memory resource instance + */ +inline std::shared_ptr create_memory_resource( + std::string const& allocation_mode) +{ + if (allocation_mode == "binning") return make_binning(); + if (allocation_mode == "cuda") return make_cuda(); + if (allocation_mode == "async") return make_async(); + if (allocation_mode == "pool") return make_pool(); + if (allocation_mode == "arena") return make_arena(); + if (allocation_mode == "managed") return make_managed(); + CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode); +} + +} // namespace test +} // namespace cudf + +/** + * @brief Parses the cuDF test command line options. + * + * Currently only supports 'rmm_mode' string parameter, which set the rmm + * allocation mode. The default value of the parameter is 'pool'. + * Environment variable 'CUDF_TEST_RMM_MODE' can also be used to set the rmm + * allocation mode. If both are set, the value of 'rmm_mode' string parameter + * takes precedence. + * + * @return Parsing results in the form of unordered map + */ +inline auto parse_cudf_test_opts(int argc, char** argv) +{ + try { + cxxopts::Options options(argv[0], " - cuDF tests command line options"); + char const* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options + char const* env_stream_mode = + std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options + char const* env_stream_error_mode = + std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options + auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; + auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; + auto default_stream_error_mode = env_stream_error_mode ? 
env_stream_error_mode : "error"; + options.allow_unrecognised_options().add_options()( + "rmm_mode", + "RMM allocation mode", + cxxopts::value()->default_value(default_rmm_mode)); + // `new_cudf_default` means that cudf::get_default_stream has been patched, + // so we raise errors anywhere that a CUDA default stream is observed + // instead of cudf::get_default_stream(). This corresponds to compiling + // identify_stream_usage with STREAM_MODE_TESTING=OFF (must do both at the + // same time). + // `new_testing_default` means that cudf::test::get_default_stream has been + // patched, so we raise errors anywhere that _any_ other stream is + // observed. This corresponds to compiling identify_stream_usage with + // STREAM_MODE_TESTING=ON (must do both at the same time). + options.allow_unrecognised_options().add_options()( + "stream_mode", + "Whether to use a non-default stream", + cxxopts::value()->default_value(default_stream_mode)); + options.allow_unrecognised_options().add_options()( + "stream_error_mode", + "Whether to error or print to stdout when a non-default stream is observed and stream_mode " + "is not \"default\"", + cxxopts::value()->default_value(default_stream_error_mode)); + return options.parse(argc, argv); + } catch (cxxopts::OptionException const& e) { + CUDF_FAIL("Error parsing command line options"); + } +} + +/** + * @brief Macro that defines main function for gtest programs that use rmm + * + * Should be included in every test program that uses rmm allocators since + * it maintains the lifespan of the rmm default memory resource. + * This `main` function is a wrapper around the google test generated `main`, + * maintaining the original functionality. In addition, this custom `main` + * function parses the command line to customize test behavior, like the + * allocation mode used for creating the default memory resource. 
+ */ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + \ + auto const stream_mode = cmd_opts["stream_mode"].as(); \ + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ + auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); \ + auto const error_on_invalid_stream = (stream_error_mode == "error"); \ + auto const check_default_stream = (stream_mode == "new_cudf_default"); \ + auto adaptor = make_stream_checking_resource_adaptor( \ + resource.get(), error_on_invalid_stream, check_default_stream); \ + rmm::mr::set_current_device_resource(&adaptor); \ + return RUN_ALL_TESTS(); \ + } \ + \ + return RUN_ALL_TESTS(); \ + } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d0abcc225d1..48bc4ac6fc1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -286,7 +286,15 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - PARQUET_TEST io/parquet_test.cpp io/parquet_chunked_reader_test.cpp + PARQUET_TEST + io/parquet_test.cpp + io/parquet_chunked_reader_test.cpp + io/parquet_chunked_writer_test.cpp + io/parquet_common.cpp + io/parquet_misc_test.cpp + io/parquet_reader_test.cpp + io/parquet_writer_test.cpp + io/parquet_v2_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 0476cb17693..01842969268 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -38,6 +39,7 @@ #include #include +#include #include #include #include diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 3c5adafc894..27865bd062f 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/binaryop/binop-fixture.hpp b/cpp/tests/binaryop/binop-fixture.hpp index bc3820da822..68e8b0f6fc6 100644 --- a/cpp/tests/binaryop/binop-fixture.hpp +++ b/cpp/tests/binaryop/binop-fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -21,6 +21,7 @@ #include #include +#include #include diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index ab008b51b51..1dd39c1c7ae 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index b278e4928e5..1ba9b14dc1f 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index b8faa0bd081..06fb687ac2d 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 3120b737427..284b6c4c50c 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index dadb8ea4eb8..f69bea2834f 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 6bae20efa8c..b58cd0e0cb9 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 24195362d92..f0c69ea6bfb 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include #include +#include +#include #include #include diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index adbcf41a66c..32a6885df09 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include #include +#include #include diff --git a/cpp/tests/encode/encode_tests.cpp b/cpp/tests/encode/encode_tests.cpp index 3638706ba7b..87818e16bb9 100644 --- a/cpp/tests/encode/encode_tests.cpp +++ b/cpp/tests/encode/encode_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include template diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 5b842322681..6bb1afda2a8 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 564f5547009..95a27defa4e 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 4f74523ec7c..6326765c68b 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 227a75d1bd5..1c1680fcd6e 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index e0c5f37d1b8..f9d034ad0c7 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index 8d71c512c79..0232696a123 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #include #include +#include #include diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp index a4b16550398..c3cc20c28b7 100644 --- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index d6762e70d80..82c4ad7d2f1 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +31,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 979f8e4fb05..ffdf2c7e00f 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 7bff730afe6..cf5a4f1fda5 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 8922658ac97..c6e9114605b 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp index c9a17185bc7..3c41f21b0a4 100644 --- a/cpp/tests/io/file_io_test.cpp +++ b/cpp/tests/io/file_io_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 74ccde6c364..4064204c56d 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 20b8674a717..f434736d7f5 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include diff --git a/cpp/tests/io/fst/quote_normalization_test.cu b/cpp/tests/io/fst/quote_normalization_test.cu index e2636ab029f..d0794b8f17e 100644 --- a/cpp/tests/io/fst/quote_normalization_test.cu +++ b/cpp/tests/io/fst/quote_normalization_test.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 09c9179de82..7fce31461ef 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 56e2404b683..f5d03293d30 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,10 +26,12 @@ #include #include #include +#include #include #include +#include #include #include diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 9eb5e8f5230..036b9170250 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index a85a696565b..946b939f456 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 93ad05a29fe..070ac5ce870 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,7 +30,9 @@ #include #include #include +#include #include +#include #include diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5124ac579fd..2ae6edc6c7d 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp new file mode 100644 index 00000000000..a0c9641097b --- /dev/null +++ b/cpp/tests/io/parquet_chunked_writer_test.cpp @@ -0,0 +1,855 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_common.hpp" + +#include +#include +#include + +#include + +// Base test fixture for chunked writer tests +struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {}; + +// Typed test fixture for numeric type tests +template +struct ParquetChunkedWriterNumericTypeTest : public ParquetChunkedWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +TEST_F(ParquetChunkedWriterTest, SingleTable) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedSingle.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(*table1); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); +} + +TEST_F(ParquetChunkedWriterTest, SimpleTable) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, true); + + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, LargeTables) +{ + srand(31337); + auto table1 = create_random_fixed_table(512, 4096, true); + auto table2 = create_random_fixed_table(512, 8192, true); + + 
auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + auto md = cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2).close(); + ASSERT_EQ(md, nullptr); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, ManyTables) +{ + srand(31337); + std::vector> tables; + std::vector table_views; + constexpr int num_tables = 96; + for (int idx = 0; idx < num_tables; idx++) { + auto tbl = create_random_fixed_table(16, 64, true); + table_views.push_back(*tbl); + tables.push_back(std::move(tbl)); + } + + auto expected = cudf::concatenate(table_views); + + auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { + writer.write(tbl); + }); + auto md = writer.close({"dummy/path"}); + ASSERT_NE(md, nullptr); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetChunkedWriterTest, Strings) +{ + std::vector> cols; + + bool mask1[] = {true, true, false, true, true, true, true}; + std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; + cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), 
mask1); + cols.push_back(strings1.release()); + cudf::table tbl1(std::move(cols)); + + bool mask2[] = {false, true, true, true, true, true, true}; + std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; + cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); + cols.push_back(strings2.release()); + cudf::table tbl2(std::move(cols)); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetChunkedWriterTest, ListColumn) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // COL0 (Same nullability) ==================== + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + lcw col0_tbl0{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; + + // [7, 8, 9] + // [] + // [NULL, 11] + // NULL + lcw col0_tbl1{{{7, 8, 9}, {}, {{10, 11}, valids}, {}}, valids2}; + + // COL1 (Nullability different in different chunks, test of merging nullability in writer) + // [NULL, 2, NULL] + // [] + // [4, 5] + // [] + lcw col1_tbl0{{{1, 2, 3}, valids}, {}, {4, 5}, {}}; + + // [7, 8, 9] + // [] + // [10, 11] + // NULL + lcw col1_tbl1{{{7, 8, 9}, {}, {10, 11}, {}}, valids2}; + + // COL2 (non-nested columns to test proper schema construction) + size_t num_rows_tbl0 = static_cast(col0_tbl0).size(); + 
size_t num_rows_tbl1 = static_cast(col0_tbl1).size(); + auto seq_col0 = random_values(num_rows_tbl0); + auto seq_col1 = random_values(num_rows_tbl1); + + column_wrapper col2_tbl0{seq_col0.begin(), seq_col0.end(), valids}; + column_wrapper col2_tbl1{seq_col1.begin(), seq_col1.end(), valids2}; + + auto tbl0 = table_view({col0_tbl0, col1_tbl0, col2_tbl0}); + auto tbl1 = table_view({col0_tbl1, col1_tbl1, col2_tbl1}); + + auto expected = cudf::concatenate(std::vector({tbl0, tbl1})); + + auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(tbl0).write(tbl1); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetChunkedWriterTest, ListOfStruct) +{ + // Table 1 + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + + auto list_offsets_column_1 = + cudf::test::fixed_width_column_wrapper{0, 2, 3, 3}.release(); + auto num_list_rows_1 = list_offsets_column_1->size() - 1; + + auto list_col_1 = cudf::make_lists_column( + num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); + + auto table_1 = table_view({*list_col_1}); + + // Table 2 + auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = 
cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + + auto list_offsets_column_2 = + cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}.release(); + auto num_list_rows_2 = list_offsets_column_2->size() - 1; + + auto list_col_2 = cudf::make_lists_column( + num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); + + auto table_2 = table_view({*list_col_2}); + + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); + + cudf::io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).set_nullability(false); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedListOfStruct.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + args.set_metadata(expected_metadata); + cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto 
valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // Table 1 =========================== + + // [] + // [NULL, 2, NULL] + // [4, 5] + // NULL + lcw land_1{{{}, {{1, 2, 3}, valids}, {4, 5}, {}}, valids2}; + + // [] + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8], []] + // [[]] + lcw flats_1{lcw{}, {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}, {}}, lcw{lcw{}}}; + + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3, 1.1}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5, 31}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1, land_1, flats_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + + auto list_offsets_column_1 = + cudf::test::fixed_width_column_wrapper{0, 2, 3, 4}.release(); + auto num_list_rows_1 = list_offsets_column_1->size() - 1; + + auto list_col_1 = cudf::make_lists_column( + num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); + + auto table_1 = table_view({*list_col_1}); + + // Table 2 =========================== + + // [] + // [7, 8, 9] + lcw land_2{{}, {7, 8, 9}}; + + // [[]] + // [[], [], []] + lcw flats_2{lcw{lcw{}}, lcw{lcw{}, lcw{}, lcw{}}}; + + auto weight_2 = cudf::test::fixed_width_column_wrapper{{-1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{351, 351}, {1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false}, {1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + + auto list_offsets_column_2 = + cudf::test::fixed_width_column_wrapper{0, 1, 2}.release(); + auto num_list_rows_2 = list_offsets_column_2->size() - 1; + + auto list_col_2 = 
cudf::make_lists_column( + num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); + + auto table_2 = table_view({*list_col_2}); + + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); + + cudf::io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).set_nullability(false); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + expected_metadata.column_metadata[0].child(1).child(1).child(2).set_name("land_unit"); + expected_metadata.column_metadata[0].child(1).child(1).child(3).set_name("flats"); + + auto filepath = temp_env->get_temp_filepath("ListOfStructOfStructOfListOfList.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + args.set_metadata(expected_metadata); + cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + + // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. 
+ auto result_parent_list = result.tbl->get_column(0); + auto result_struct_2 = result_parent_list.child(cudf::lists_column_view::child_column_index); + EXPECT_EQ(result_struct_2.nullable(), false); +} + +TEST_F(ParquetChunkedWriterTest, MismatchedTypes) +{ + srand(31337); + auto table1 = create_random_fixed_table(4, 4, true); + auto table2 = create_random_fixed_table(4, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + writer.write(*table1); + EXPECT_THROW(writer.write(*table2), cudf::logic_error); + writer.close(); +} + +TEST_F(ParquetChunkedWriterTest, ChunkedWriteAfterClosing) +{ + srand(31337); + auto table = create_random_fixed_table(4, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedWriteAfterClosing.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + writer.write(*table).close(); + EXPECT_THROW(writer.write(*table), cudf::logic_error); +} + +TEST_F(ParquetChunkedWriterTest, ReadingUnclosedFile) +{ + srand(31337); + auto table = create_random_fixed_table(4, 4, true); + + auto filepath = temp_env->get_temp_filepath("ReadingUnclosedFile.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + writer.write(*table); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); +} + +TEST_F(ParquetChunkedWriterTest, MismatchedStructure) +{ + srand(31337); + auto table1 = create_random_fixed_table(4, 4, true); + 
auto table2 = create_random_fixed_table(3, 4, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + writer.write(*table1); + EXPECT_THROW(writer.write(*table2), cudf::logic_error); + writer.close(); +} + +TEST_F(ParquetChunkedWriterTest, MismatchedStructureList) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // COL0 (mismatched depth) ==================== + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + lcw col00{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; + + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col01{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // COL2 (non-nested columns to test proper schema construction) + size_t num_rows = static_cast(col00).size(); + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids}; + column_wrapper col11{seq_col1.begin(), seq_col1.end(), valids2}; + + auto tbl0 = table_view({col00, col10}); + auto tbl1 = table_view({col01, col11}); + + auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer writer(args); + writer.write(tbl0); + EXPECT_THROW(writer.write(tbl1), cudf::logic_error); +} + +TEST_F(ParquetChunkedWriterTest, DifferentNullability) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, 
false); + + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) +{ + // Struct, + // age:int + // > (nullable) + // > (non-nullable) + + // Table 1: is_human and struct_1 are non-nullable but should be nullable when read back. + auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + auto table_1 = cudf::table_view({struct_2_1}); + + // Table 2: struct_1 and is_human are nullable now so if we hadn't assumed worst case (nullable) + // when writing table_1, we would have wrong pages for it. 
+ auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + auto table_2 = cudf::table_view({struct_2_2}); + + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); + + cudf::io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + args.set_metadata(expected_metadata); + cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(ParquetChunkedWriterTest, ForcedNullability) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, false); + auto table2 = create_random_fixed_table(5, 5, false); + + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet"); + + cudf::io::table_input_metadata metadata(*table1); 
+ + // In the absence of prescribed per-column nullability in metadata, the writer assumes the worst + // and considers all columns nullable. However cudf::concatenate will not force nulls in case no + // columns are nullable. To get the expected result, we tell the writer the nullability of all + // columns in advance. + for (auto& col_meta : metadata.column_metadata) { + col_meta.set_nullability(false); + } + + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}) + .metadata(std::move(metadata)); + cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) +{ + srand(31337); + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // COL0 ==================== + // [1, 2, 3] + // [] + // [4, 5] + // NULL + lcw col00{{{1, 2, 3}, {}, {4, 5}, {}}, valids2}; + + // [7] + // [] + // [8, 9, 10, 11] + // NULL + lcw col01{{{7}, {}, {8, 9, 10, 11}, {}}, valids2}; + + // COL1 (non-nested columns to test proper schema construction) + size_t num_rows = static_cast(col00).size(); + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids}; + column_wrapper col11{seq_col1.begin(), seq_col1.end(), valids2}; + + auto table1 = table_view({col00, col10}); + auto table2 = table_view({col01, col11}); + + auto full_table = cudf::concatenate(std::vector({table1, table2})); + + cudf::io::table_input_metadata 
metadata(table1); + metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level + metadata.column_metadata[0].child(1).set_nullability( + false); // non-nullable at second (leaf) level + metadata.column_metadata[1].set_nullability(true); + + auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet"); + + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}) + .metadata(std::move(metadata)); + cudf::io::parquet_chunked_writer(args).write(table1).write(table2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) +{ + // Struct, + // age:int + // > (nullable) + // > (non-nullable) + + // Table 1: is_human and struct_2 are non-nullable and should stay that way when read back. 
+ auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; + auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; + auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; + auto table_1 = cudf::table_view({struct_2_1}); + + auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; + auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; + auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; + auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}}; + auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; + auto table_2 = cudf::table_view({struct_2_2}); + + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); + + cudf::io::table_input_metadata expected_metadata(table_1); + expected_metadata.column_metadata[0].set_name("being").set_nullability(false); + expected_metadata.column_metadata[0].child(0).set_name("human?").set_nullability(false); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + args.set_metadata(expected_metadata); + cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); + 
cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(ParquetChunkedWriterTest, ReadRowGroups) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + auto table2 = create_random_fixed_table(5, 5, true); + + auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + { + cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); + } + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .row_groups({{1, 0, 1}}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); +} + +TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError) +{ + srand(31337); + auto table1 = create_random_fixed_table(5, 5, true); + + auto filepath = temp_env->get_temp_filepath("ChunkedRowGroupsError.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(*table1); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).row_groups({{0, 1}}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); + read_opts.set_row_groups({{-1}}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); + read_opts.set_row_groups({{0}, {0}}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); +} + +TEST_F(ParquetChunkedWriterTest, RowGroupPageSizeMatch) +{ + std::vector out_buffer; + + auto options = cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .row_group_size_bytes(128 * 1024) + .max_page_size_bytes(512 * 1024) 
+ .row_group_size_rows(10000) + .max_page_size_rows(20000) + .build(); + EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); + EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); +} + +TEST_F(ParquetChunkedWriterTest, CompStats) +{ + auto table = create_random_fixed_table(1, 100000, true); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::chunked_parquet_writer_options opts = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression_statistics(stats); + cudf::io::parquet_chunked_writer(opts).write(*table); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); + + auto const single_table_comp_stats = *stats; + cudf::io::parquet_chunked_writer(opts).write(*table); + + EXPECT_EQ(stats->compression_ratio(), single_table_comp_stats.compression_ratio()); + EXPECT_EQ(stats->num_compressed_bytes(), 2 * single_table_comp_stats.num_compressed_bytes()); + + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); +} + +TEST_F(ParquetChunkedWriterTest, CompStatsEmptyTable) +{ + auto table_no_rows = create_random_fixed_table(20, 0, false); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::chunked_parquet_writer_options opts = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}) + .compression_statistics(stats); + cudf::io::parquet_chunked_writer(opts).write(*table_no_rows); + + expect_compression_stats_empty(stats); +} + +TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); + +TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) +{ + // write out two 31 row tables and make sure they get + // read back with all their validity bits in the right place + + using T = TypeParam; + + 
int num_els = 31; + std::vector> cols; + + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + + true, true, true, true, true, true, true, true, true}; + T c1a[num_els]; + std::fill(c1a, c1a + num_els, static_cast(5)); + T c1b[num_els]; + std::fill(c1b, c1b + num_els, static_cast(6)); + column_wrapper c1a_w(c1a, c1a + num_els, mask); + column_wrapper c1b_w(c1b, c1b + num_els, mask); + cols.push_back(c1a_w.release()); + cols.push_back(c1b_w.release()); + cudf::table tbl1(std::move(cols)); + + T c2a[num_els]; + std::fill(c2a, c2a + num_els, static_cast(8)); + T c2b[num_els]; + std::fill(c2b, c2b + num_els, static_cast(9)); + column_wrapper c2a_w(c2a, c2a + num_els, mask); + column_wrapper c2b_w(c2b, c2b + num_els, mask); + cols.push_back(c2a_w.release()); + cols.push_back(c2b_w.release()); + cudf::table tbl2(std::move(cols)); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) +{ + // write out two 33 row tables and make sure they get + // read back with all their validity bits in the right place + + using T = TypeParam; + + int num_els = 33; + std::vector> cols; + + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, 
true}; + + T c1a[num_els]; + std::fill(c1a, c1a + num_els, static_cast(5)); + T c1b[num_els]; + std::fill(c1b, c1b + num_els, static_cast(6)); + column_wrapper c1a_w(c1a, c1a + num_els, mask); + column_wrapper c1b_w(c1b, c1b + num_els, mask); + cols.push_back(c1a_w.release()); + cols.push_back(c1b_w.release()); + cudf::table tbl1(std::move(cols)); + + T c2a[num_els]; + std::fill(c2a, c2a + num_els, static_cast(8)); + T c2b[num_els]; + std::fill(c2b, c2b + num_els, static_cast(9)); + column_wrapper c2a_w(c2a, c2a + num_els, mask); + column_wrapper c2b_w(c2b, c2b + num_els, mask); + cols.push_back(c2a_w.release()); + cols.push_back(c2b_w.release()); + cudf::table tbl2(std::move(cols)); + + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); + + auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet"); + cudf::io::chunked_parquet_writer_options args = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp new file mode 100644 index 00000000000..b64cd230bc6 --- /dev/null +++ b/cpp/tests/io/parquet_common.cpp @@ -0,0 +1,798 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "parquet_common.hpp" + +#include + +// Global environment for temporary files +cudf::test::TempDirTestEnvironment* const temp_env = + static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +template +std::unique_ptr create_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity, + Elements elements) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + std::vector> src_cols(num_columns); + for (int idx = 0; idx < num_columns; idx++) { + if (include_validity) { + src_cols[idx] = + cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); + } else { + src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + } + } + std::vector> columns(num_columns); + std::transform(src_cols.begin(), + src_cols.end(), + columns.begin(), + [](cudf::test::fixed_width_column_wrapper& in) { + auto ret = in.release(); + // pre-cache the null count + [[maybe_unused]] auto const nulls = ret->has_nulls(); + return ret; + }); + return std::make_unique(std::move(columns)); +} + +template +std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity) +{ + auto rand_elements = + cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); + return create_fixed_table(num_columns, num_rows, include_validity, rand_elements); +} + +template +std::unique_ptr create_compressible_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + cudf::size_type period, + bool include_validity) +{ + auto compressible_elements = + cudf::detail::make_counting_transform_iterator(0, [period](T i) { return i / period; }); + return create_fixed_table(num_columns, num_rows, include_validity, compressible_elements); +} + 
+template std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity); +template std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity); + +template std::unique_ptr create_compressible_fixed_table( + cudf::size_type num_columns, + cudf::size_type num_rows, + cudf::size_type period, + bool include_validity); + +template std::unique_ptr create_compressible_fixed_table( + cudf::size_type num_columns, + cudf::size_type num_rows, + cudf::size_type period, + bool include_validity); + +// this function replicates the "list_gen" function in +// python/cudf/cudf/tests/test_parquet.py +template +std::unique_ptr make_parquet_list_list_col( + int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0 ? 1 : 0; }); + + // root list + std::vector row_offsets(num_rows + 1); + int row_offset_count = 0; + { + int offset = 0; + for (int idx = 0; idx < (num_rows) + 1; idx++) { + row_offsets[row_offset_count] = offset; + if (!include_validity || valids[idx]) { offset += lists_per_row; } + row_offset_count++; + } + } + cudf::test::fixed_width_column_wrapper offsets(row_offsets.begin(), + row_offsets.begin() + row_offset_count); + + // child list + std::vector child_row_offsets((num_rows * lists_per_row) + 1); + int child_row_offset_count = 0; + { + int offset = 0; + for (int idx = 0; idx < (num_rows * lists_per_row); idx++) { + int row_index = idx / lists_per_row; + if (include_validity && !valids[row_index]) { continue; } + + child_row_offsets[child_row_offset_count] = offset; + offset += list_size; + child_row_offset_count++; + } + child_row_offsets[child_row_offset_count++] = offset; + } + cudf::test::fixed_width_column_wrapper child_offsets( + child_row_offsets.begin(), child_row_offsets.begin() + child_row_offset_count); + + 
// child values + std::vector child_values(num_rows * lists_per_row * list_size); + T first_child_value_index = skip_rows * lists_per_row * list_size; + int child_value_count = 0; + { + for (int idx = 0; idx < (num_rows * lists_per_row * list_size); idx++) { + int row_index = idx / (lists_per_row * list_size); + + int val = first_child_value_index; + first_child_value_index++; + + if (include_validity && !valids[row_index]) { continue; } + + child_values[child_value_count] = val; + child_value_count++; + } + } + // validity by value instead of index + auto valids2 = cudf::detail::make_counting_transform_iterator( + 0, [list_size](auto i) { return (i % list_size) % 2 == 0 ? 1 : 0; }); + auto child_data = include_validity + ? cudf::test::fixed_width_column_wrapper( + child_values.begin(), child_values.begin() + child_value_count, valids2) + : cudf::test::fixed_width_column_wrapper( + child_values.begin(), child_values.begin() + child_value_count); + + int child_offsets_size = static_cast(child_offsets).size() - 1; + auto child = cudf::make_lists_column( + child_offsets_size, child_offsets.release(), child_data.release(), 0, rmm::device_buffer{}); + + int offsets_size = static_cast(offsets).size() - 1; + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + offsets_size); + return include_validity + ? 
cudf::make_lists_column( + offsets_size, offsets.release(), std::move(child), null_count, std::move(null_mask)) + : cudf::make_lists_column( + offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); +} + +template std::unique_ptr make_parquet_list_list_col( + int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity); + +template +std::vector random_values(size_t size) +{ + std::vector values(size); + + using T1 = T; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + + static constexpr auto seed = 0xf00d; + static std::mt19937 engine{seed}; + static uniform_distribution dist{}; + std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); + + return values; +} + +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); +template std::vector random_values(size_t size); + +// given a datasource pointing to a parquet file, read the footer +// of the file to populate the FileMetaData pointed to by file_meta_data. +// throws cudf::logic_error if the file or metadata is invalid. 
+void read_footer(std::unique_ptr const& source, + cudf::io::parquet::detail::FileMetaData* file_meta_data) +{ + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); + + auto const len = source->size(); + auto const header_buffer = source->host_read(0, header_len); + auto const header = + reinterpret_cast(header_buffer->data()); + auto const ender_buffer = source->host_read(len - ender_len, ender_len); + auto const ender = + reinterpret_cast(ender_buffer->data()); + + // checks for valid header, footer, and file length + ASSERT_GT(len, header_len + ender_len); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); + ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); + + // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" + // seek backwards from the end of the file (footer_length + 8 bytes of ender) + auto const footer_buffer = + source->host_read(len - ender->footer_len - ender_len, ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + + cp.read(file_meta_data); +} + +// returns the number of bits used for dictionary encoding data at the given page location. +// this assumes the data is uncompressed. +// throws cudf::logic_error if the page_loc data is invalid. 
+int read_dict_bits(std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) +{ + CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); + CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); + + cudf::io::parquet::detail::PageHeader page_hdr; + auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cp.read(&page_hdr); + + // cp should be pointing at the start of page data now. the first byte + // should be the encoding bit size + return cp.getb(); +} + +// read column index from datasource at location indicated by chunk, +// parse and return as a ColumnIndex struct. +// throws cudf::logic_error if the chunk data is invalid. +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) +{ + CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); + CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); + + cudf::io::parquet::detail::ColumnIndex colidx; + auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cp.read(&colidx); + return colidx; +} + +// read offset index from datasource at location indicated by chunk, +// parse and return as an OffsetIndex struct. +// throws cudf::logic_error if the chunk data is invalid. 
+cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) +{ + CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); + CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); + + cudf::io::parquet::detail::OffsetIndex offidx; + auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cp.read(&offidx); + return offidx; +} + +// Return as a Statistics from the column chunk +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) +{ + return chunk.meta_data.statistics; +} + +// read page header from datasource at location indicated by page_loc, +// parse and return as a PageHeader struct. +// throws cudf::logic_error if the page_loc data is invalid. +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) +{ + CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); + CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); + + cudf::io::parquet::detail::PageHeader page_hdr; + auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cp.read(&page_hdr); + return page_hdr; +} + +// ============================================================================= +// ---- test data for stats sort order tests + +namespace testdata { +// ----- most numerics. scale by 100 so all values fit in a single byte + +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +ascending() +{ + int start = std::is_signed_v ? 
-num_ordered_rows / 2 : 0; + auto elements = + cudf::detail::make_counting_transform_iterator(start, [](auto i) { return i / 100; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +descending() +{ + if (std::is_signed_v) { + auto elements = cudf::detail::make_counting_transform_iterator(-num_ordered_rows / 2, + [](auto i) { return -i / 100; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); + } else { + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (num_ordered_rows - i) / 100; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); + } +} + +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +unordered() +{ + if (std::is_signed_v) { + auto elements = cudf::detail::make_counting_transform_iterator( + -num_ordered_rows / 2, [](auto i) { return (i % 2 ? i : -i) / 100; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); + } else { + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 ? 
i : num_ordered_rows - i) / 100; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); + } +} + +// ----- bool + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> ascending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i >= num_ordered_rows / 2; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> descending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i < num_ordered_rows / 2; }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> unordered() +{ + auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + switch (i / page_size_for_ordered_tests) { + case 0: return true; + case 1: return false; + case 2: return true; + default: return false; + } + }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +// ----- fixed point types + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + -num_ordered_rows / 2, [](auto i) { return T(i, numeric::scale_type{0}); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + -num_ordered_rows / 2, [](auto i) { return T(-i, numeric::scale_type{0}); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 
-num_ordered_rows / 2, [](auto i) { return T(i % 2 ? i : -i, numeric::scale_type{0}); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +// ----- chrono types +// ----- timstamp + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return T(typename T::duration(i)); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return T(typename T::duration(num_ordered_rows - i)); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return T(typename T::duration(i % 2 ? i : num_ordered_rows - i)); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +// ----- duration + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() +{ + auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return T(i); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return T(num_ordered_rows - i); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() +{ + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return T(i % 2 ? 
i : num_ordered_rows - i); }); + return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); +} + +// ----- string_view + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +ascending() +{ + char buf[10]; + auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { + sprintf(buf, "%09d", i); + return std::string(buf); + }); + return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +descending() +{ + char buf[10]; + auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { + sprintf(buf, "%09d", num_ordered_rows - i); + return std::string(buf); + }); + return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); +} + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +unordered() +{ + char buf[10]; + auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { + sprintf(buf, "%09d", (i % 2 == 0) ? 
i : (num_ordered_rows - i)); + return std::string(buf); + }); + return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); +} + +#define FIXED_WIDTH_ORDERED_DATA(type) \ + template cudf::test::fixed_width_column_wrapper ascending(); \ + template cudf::test::fixed_width_column_wrapper descending(); \ + template cudf::test::fixed_width_column_wrapper unordered() + +FIXED_WIDTH_ORDERED_DATA(bool); +FIXED_WIDTH_ORDERED_DATA(int8_t); +FIXED_WIDTH_ORDERED_DATA(int16_t); +FIXED_WIDTH_ORDERED_DATA(int32_t); +FIXED_WIDTH_ORDERED_DATA(int64_t); +FIXED_WIDTH_ORDERED_DATA(uint8_t); +FIXED_WIDTH_ORDERED_DATA(uint16_t); +FIXED_WIDTH_ORDERED_DATA(uint32_t); +FIXED_WIDTH_ORDERED_DATA(uint64_t); +FIXED_WIDTH_ORDERED_DATA(float); +FIXED_WIDTH_ORDERED_DATA(double); +FIXED_WIDTH_ORDERED_DATA(cudf::duration_D); +FIXED_WIDTH_ORDERED_DATA(cudf::duration_s); +FIXED_WIDTH_ORDERED_DATA(cudf::duration_ms); +FIXED_WIDTH_ORDERED_DATA(cudf::duration_us); +FIXED_WIDTH_ORDERED_DATA(cudf::duration_ns); +FIXED_WIDTH_ORDERED_DATA(cudf::timestamp_D); +FIXED_WIDTH_ORDERED_DATA(cudf::timestamp_s); +FIXED_WIDTH_ORDERED_DATA(cudf::timestamp_ms); +FIXED_WIDTH_ORDERED_DATA(cudf::timestamp_us); +FIXED_WIDTH_ORDERED_DATA(cudf::timestamp_ns); +FIXED_WIDTH_ORDERED_DATA(numeric::decimal32); +FIXED_WIDTH_ORDERED_DATA(numeric::decimal64); +FIXED_WIDTH_ORDERED_DATA(numeric::decimal128); + +template cudf::test::strings_column_wrapper ascending(); +template cudf::test::strings_column_wrapper descending(); +template cudf::test::strings_column_wrapper unordered(); + +} // namespace testdata + +template +std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity) +{ + std::vector row_sizes(num_rows); + + auto const min_values_per_row = include_validity ? 
0 : 1; + std::uniform_int_distribution dist{min_values_per_row, max_vals_per_row}; + std::generate_n(row_sizes.begin(), num_rows, [&]() { return cudf::size_type{dist(engine)}; }); + + std::vector offsets(num_rows + 1); + std::exclusive_scan(row_sizes.begin(), row_sizes.end(), offsets.begin(), 0); + offsets[num_rows] = offsets[num_rows - 1] + row_sizes.back(); + + std::vector values = random_values(offsets[num_rows]); + cudf::test::fixed_width_column_wrapper offsets_col(offsets.begin(), + offsets.end()); + + if (include_validity) { + auto valids = random_validity(engine); + auto values_col = + cudf::test::fixed_width_column_wrapper(values.begin(), values.end(), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto col = cudf::make_lists_column( + num_rows, offsets_col.release(), values_col.release(), null_count, std::move(null_mask)); + return cudf::purge_nonempty_nulls(*col); + } else { + auto values_col = cudf::test::fixed_width_column_wrapper(values.begin(), values.end()); + return cudf::make_lists_column(num_rows, + offsets_col.release(), + values_col.release(), + 0, + cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID)); + } +} + +template std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity); +template std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity); +template std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity); +template std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity); + +std::vector string_values(std::mt19937& engine, int num_rows, int max_string_len) +{ + static std::uniform_int_distribution char_dist{'a', 'z'}; + static std::uniform_int_distribution strlen_dist{1, max_string_len}; + + 
std::vector values(num_rows); + std::generate_n(values.begin(), values.size(), [&]() { + int str_len = strlen_dist(engine); + std::string res = ""; + for (int i = 0; i < str_len; i++) { + res += char_dist(engine); + } + return res; + }); + + return values; +} + +// make a random list column, with random string lengths of 0..max_string_len, +// and up to max_vals_per_row strings in each list. +std::unique_ptr make_parquet_string_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + int max_string_len, + bool include_validity) +{ + auto const range_min = include_validity ? 0 : 1; + + std::uniform_int_distribution dist{range_min, max_vals_per_row}; + + std::vector row_sizes(num_rows); + std::generate_n(row_sizes.begin(), num_rows, [&]() { return cudf::size_type{dist(engine)}; }); + + std::vector offsets(num_rows + 1); + std::exclusive_scan(row_sizes.begin(), row_sizes.end(), offsets.begin(), 0); + offsets[num_rows] = offsets[num_rows - 1] + row_sizes.back(); + + std::uniform_int_distribution strlen_dist{range_min, max_string_len}; + auto const values = string_values(engine, offsets[num_rows], max_string_len); + + cudf::test::fixed_width_column_wrapper offsets_col(offsets.begin(), + offsets.end()); + + if (include_validity) { + auto valids = random_validity(engine); + auto values_col = cudf::test::strings_column_wrapper(values.begin(), values.end(), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto col = cudf::make_lists_column( + num_rows, offsets_col.release(), values_col.release(), null_count, std::move(null_mask)); + return cudf::purge_nonempty_nulls(*col); + } else { + auto values_col = cudf::test::strings_column_wrapper(values.begin(), values.end()); + return cudf::make_lists_column(num_rows, + offsets_col.release(), + values_col.release(), + 0, + cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID)); + } +} + +template +std::pair 
create_parquet_typed_with_stats(std::string const& filename) +{ + auto col0 = testdata::ascending(); + auto col1 = testdata::descending(); + auto col2 = testdata::unordered(); + + auto const written_table = table_view{{col0, col1, col2}}; + auto const filepath = temp_env->get_temp_filepath("FilterTyped.parquet"); + { + cudf::io::table_input_metadata expected_metadata(written_table); + expected_metadata.column_metadata[0].set_name("col0"); + expected_metadata.column_metadata[1].set_name("col1"); + expected_metadata.column_metadata[2].set_name("col2"); + + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) + .metadata(std::move(expected_metadata)) + .row_group_size_rows(8000); + cudf::io::write_parquet(out_opts); + } + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return std::pair{cudf::table{std::move(columns)}, filepath}; +} + +#define TYPED_WITH_STATS(type) \ + template std::pair create_parquet_typed_with_stats( \ + std::string const& filename) + +TYPED_WITH_STATS(cudf::string_view); +TYPED_WITH_STATS(bool); +TYPED_WITH_STATS(int8_t); +TYPED_WITH_STATS(int16_t); +TYPED_WITH_STATS(int32_t); +TYPED_WITH_STATS(int64_t); +TYPED_WITH_STATS(uint8_t); +TYPED_WITH_STATS(uint16_t); +TYPED_WITH_STATS(uint32_t); +TYPED_WITH_STATS(uint64_t); +TYPED_WITH_STATS(float); +TYPED_WITH_STATS(double); +// TYPED_WITH_STATS(cudf::duration_D); +// TYPED_WITH_STATS(cudf::duration_s); +TYPED_WITH_STATS(cudf::duration_ms); +TYPED_WITH_STATS(cudf::duration_us); +TYPED_WITH_STATS(cudf::duration_ns); +TYPED_WITH_STATS(cudf::timestamp_D); +// TYPED_WITH_STATS(cudf::timestamp_s); +TYPED_WITH_STATS(cudf::timestamp_ms); +TYPED_WITH_STATS(cudf::timestamp_us); +TYPED_WITH_STATS(cudf::timestamp_ns); +TYPED_WITH_STATS(numeric::decimal32); +TYPED_WITH_STATS(numeric::decimal64); +TYPED_WITH_STATS(numeric::decimal128); + +// 
utility functions for column index tests + +// compare two values. return -1 if v1 < v2, +// 0 if v1 == v2, and 1 if v1 > v2. +template +int32_t compare(T& v1, T& v2) +{ + return (v1 > v2) - (v1 < v2); +} + +// compare two binary statistics blobs based on their physical +// and converted types. returns -1 if v1 < v2, 0 if v1 == v2, and +// 1 if v1 > v2. +int32_t compare_binary(std::vector const& v1, + std::vector const& v2, + cudf::io::parquet::detail::Type ptype, + thrust::optional const& ctype) +{ + auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); + switch (ptype) { + case cudf::io::parquet::detail::INT32: + switch (ctype_val) { + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + default: + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + } + + case cudf::io::parquet::detail::INT64: + if (ctype_val == cudf::io::parquet::detail::UINT_64) { + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + } + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + + case cudf::io::parquet::detail::FLOAT: + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + + case cudf::io::parquet::detail::DOUBLE: + return compare(*(reinterpret_cast(v1.data())), + *(reinterpret_cast(v2.data()))); + + case cudf::io::parquet::detail::BYTE_ARRAY: { + int32_t v1sz = v1.size(); + int32_t v2sz = v2.size(); + int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); + if (ret != 0 or v1sz == v2sz) { return ret; } + return v1sz - v2sz; + } + + default: CUDF_FAIL("Invalid type in compare_binary"); + } + + return 0; +} + +void expect_compression_stats_empty(std::shared_ptr stats) +{ + EXPECT_EQ(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + 
EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_TRUE(std::isnan(stats->compression_ratio())); +} diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp new file mode 100644 index 00000000000..59ee85444f2 --- /dev/null +++ b/cpp/tests/io/parquet_common.hpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +template +using column_wrapper = + typename std::conditional, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>::type; +using column = cudf::column; +using table = cudf::table; +using table_view = cudf::table_view; + +// Global environment for temporary files +extern cudf::test::TempDirTestEnvironment* const temp_env; + +// TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 +using SupportedTypes = cudf::test::Types; + +using ComparableAndFixedTypes = + cudf::test::Concat; + +using SupportedTimestampTypes = + cudf::test::Types; + +using ByteLikeTypes = cudf::test::Types; + +// These chrono types are not supported because parquet writer does not have a type to represent +// them. +using UnsupportedChronoTypes = + cudf::test::Types; +// Also fixed point types unsupported, because AST does not support them yet. 
+using SupportedTestTypes = cudf::test::RemoveIf, + cudf::test::ComparableTypes>; + +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. +using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; + +////////////////////////////////////////////////////////////////////// +// Test fixtures + +// Base test fixture for tests +struct ParquetWriterTest : public cudf::test::BaseFixture {}; + +// Base test fixture for tests +struct ParquetReaderTest : public cudf::test::BaseFixture {}; + +//////////////////////////////////////////////////////////////////// + +// Generates a vector of uniform random values of type T +template +std::vector random_values(size_t size); + +template +std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + bool include_validity); + +template +std::unique_ptr create_compressible_fixed_table(cudf::size_type num_columns, + cudf::size_type num_rows, + cudf::size_type period, + bool include_validity); + +// this function replicates the "list_gen" function in +// python/cudf/cudf/tests/test_parquet.py +template +std::unique_ptr make_parquet_list_list_col( + int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity); + +// given a datasource pointing to a parquet file, read the footer +// of the file to populate the FileMetaData pointed to by file_meta_data. +// throws cudf::logic_error if the file or metadata is invalid. +void read_footer(std::unique_ptr const& source, + cudf::io::parquet::detail::FileMetaData* file_meta_data); + +// returns the number of bits used for dictionary encoding data at the given page location. +// this assumes the data is uncompressed. +// throws cudf::logic_error if the page_loc data is invalid. 
+int read_dict_bits(std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc); + +// read column index from datasource at location indicated by chunk, +// parse and return as a ColumnIndex struct. +// throws cudf::logic_error if the chunk data is invalid. +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk); + +// read offset index from datasource at location indicated by chunk, +// parse and return as an OffsetIndex struct. +// throws cudf::logic_error if the chunk data is invalid. +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk); + +// Return as a Statistics from the column chunk +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk); + +// read page header from datasource at location indicated by page_loc, +// parse and return as a PageHeader struct. +// throws cudf::logic_error if the page_loc data is invalid. +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc); + +// make a random validity iterator +inline auto random_validity(std::mt19937& engine) +{ + static std::bernoulli_distribution bn(0.7f); + return cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(engine); }); +} + +// make a random list column +template +std::unique_ptr make_parquet_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + bool include_validity); + +// return vector of random strings +std::vector string_values(std::mt19937& engine, int num_rows, int max_string_len); + +// make a random list column, with random string lengths of 0..max_string_len, +// and up to max_vals_per_row strings in each list. 
+std::unique_ptr make_parquet_string_list_col(std::mt19937& engine, + int num_rows, + int max_vals_per_row, + int max_string_len, + bool include_validity); + +template +std::pair create_parquet_typed_with_stats(std::string const& filename); + +int32_t compare_binary(std::vector const& v1, + std::vector const& v2, + cudf::io::parquet::detail::Type ptype, + thrust::optional const& ctype); + +void expect_compression_stats_empty(std::shared_ptr stats); + +// ============================================================================= +// ---- test data for stats sort order tests +// need at least 3 pages, and min page count is 5000, so need at least 15000 values. +// use 20000 to be safe. +static constexpr int num_ordered_rows = 20000; +static constexpr int page_size_for_ordered_tests = 5000; + +namespace testdata { + +// ----- most numerics +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +ascending(); + +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +descending(); + +template +std::enable_if_t && !std::is_same_v, + cudf::test::fixed_width_column_wrapper> +unordered(); + +// ----- bool + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> ascending(); + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> +descending(); + +template +std::enable_if_t, cudf::test::fixed_width_column_wrapper> unordered(); + +// ----- fixed point types + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending(); + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending(); + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered(); + +// ----- chrono types +// ----- timstamp + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending(); + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending(); + +template +std::enable_if_t(), 
cudf::test::fixed_width_column_wrapper> unordered(); + +// ----- duration + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending(); + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending(); + +template +std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered(); + +// ----- string_view + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +ascending(); + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +descending(); + +template +std::enable_if_t, cudf::test::strings_column_wrapper> +unordered(); + +} // namespace testdata diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp new file mode 100644 index 00000000000..49b6b8fd259 --- /dev/null +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_common.hpp" + +#include +#include + +#include +#include +#include + +//////////////////////////////// +// delta encoding writer tests + +// Test fixture for delta encoding tests +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto 
result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +//////////////////////// +// sized tests + +// Base 
test fixture for size-parameterized tests +class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; + +// test the allowed bit widths for dictionary encoding +INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, + ParquetSizedTest, + testing::Range(1, 25), + testing::PrintToStringParamName()); + +TEST_P(ParquetSizedTest, DictionaryTest) +{ + unsigned int const cardinality = (1 << (GetParam() - 1)) + 1; + unsigned int const nrows = std::max(cardinality * 3 / 2, 3'000'000U); + + auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { + return "a unique string value suffixed with " + std::to_string(i % cardinality); + }); + auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("DictionaryTest.parquet"); + // set row group size so that there will be only one row group + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::NONE) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS) + .row_group_size_rows(nrows) + .row_group_size_bytes(512 * 1024 * 1024); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + // make sure dictionary was used + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd]() { + for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { + if (enc == 
cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_TRUE(used_dict()); + + // and check that the correct number of bits was used + auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); + auto const nbits = read_dict_bits(source, oi.page_locations[0]); + EXPECT_EQ(nbits, GetParam()); +} + +/////////////////////// +// comparable tests + +// Typed test fixture for comparable type tests +template +struct ParquetWriterComparableTypeTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +TYPED_TEST_SUITE(ParquetWriterComparableTypeTest, ComparableAndFixedTypes); + +TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) +{ + using T = TypeParam; + + auto col0 = testdata::ascending(); + auto col1 = testdata::descending(); + auto col2 = testdata::unordered(); + + auto const expected = table_view{{col0, col1, col2}}; + + auto const filepath = temp_env->get_temp_filepath("ThreeColumnSorted.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(page_size_for_ordered_tests) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_GT(fmd.row_groups.size(), 0); + + auto const& columns = fmd.row_groups[0].columns; + ASSERT_EQ(columns.size(), static_cast(expected.num_columns())); + + // now check that the boundary order for chunk 1 is ascending, + // chunk 2 is descending, and chunk 3 is unordered + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + 
cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; + + for (std::size_t i = 0; i < columns.size(); i++) { + auto const ci = read_column_index(source, columns[i]); + EXPECT_EQ(ci.boundary_order, expected_orders[i]); + } +} diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp new file mode 100644 index 00000000000..5cb05ac7011 --- /dev/null +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -0,0 +1,2340 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_common.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +TEST_F(ParquetReaderTest, UserBounds) +{ + // trying to read more rows than there are should result in + // receiving the properly capped # of rows + { + srand(31337); + auto expected = create_random_fixed_table(4, 4, false); + + auto filepath = temp_env->get_temp_filepath("TooManyRows.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + // attempt to read more rows than there actually are + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).num_rows(16); + auto result = cudf::io::read_parquet(read_opts); + + // we should only get back 4 rows + EXPECT_EQ(result.tbl->view().column(0).size(), 4); + } + + // trying to read past the end of the # of actual rows should result + // in empty columns. 
+ { + srand(31337); + auto expected = create_random_fixed_table(4, 4, false); + + auto filepath = temp_env->get_temp_filepath("PastBounds.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + // attempt to read more rows than there actually are + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).skip_rows(4); + auto result = cudf::io::read_parquet(read_opts); + + // we should get empty columns back + EXPECT_EQ(result.tbl->view().num_columns(), 4); + EXPECT_EQ(result.tbl->view().column(0).size(), 0); + } + + // trying to read 0 rows should result in empty columns + { + srand(31337); + auto expected = create_random_fixed_table(4, 4, false); + + auto filepath = temp_env->get_temp_filepath("ZeroRows.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + // attempt to read more rows than there actually are + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).num_rows(0); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().num_columns(), 4); + EXPECT_EQ(result.tbl->view().column(0).size(), 0); + } + + // trying to read 0 rows past the end of the # of actual rows should result + // in empty columns. 
+ { + srand(31337); + auto expected = create_random_fixed_table(4, 4, false); + + auto filepath = temp_env->get_temp_filepath("ZeroRowsPastBounds.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + // attempt to read more rows than there actually are + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .skip_rows(4) + .num_rows(0); + auto result = cudf::io::read_parquet(read_opts); + + // we should get empty columns back + EXPECT_EQ(result.tbl->view().num_columns(), 4); + EXPECT_EQ(result.tbl->view().column(0).size(), 0); + } +} + +TEST_F(ParquetReaderTest, UserBoundsWithNulls) +{ + // clang-format off + cudf::test::fixed_width_column_wrapper col{{1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7, 8,8,8,8,8,8,8,8} + ,{1,1,1,0,0,0,1,1, 1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,0,0, 1,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0}}; + // clang-format on + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("UserBoundsWithNulls.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {1, 3}, {3, -1}, + {31, -1}, {32, -1}, {33, -1}, + {31, 5}, {32, 5}, {33, 5}, + {-1, 7}, {-1, 31}, {-1, 32}, {-1, 33}, + {62, -1}, {63, -1}, + {62, 2}, {63, 1}}; + // clang-format on + for (auto p : params) { + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = 
cudf::io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + +TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) +{ + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // int64 + cudf::test::fixed_width_column_wrapper c0(values, values + num_rows, valids); + + // list + constexpr int floats_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [floats_per_row](cudf::size_type idx) { return idx * floats_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_floats( + values, values + (num_rows * floats_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_floats.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + // list> + auto c2 = make_parquet_list_list_col(0, num_rows, 5, 8, true); + + // struct, int, float> + std::vector strings{ + "abc", "x", "bananas", "gpu", "minty", "backspace", "", "cayenne", "turbine", "soft"}; + std::uniform_int_distribution uni(0, strings.size() - 1); + auto string_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](cudf::size_type idx) { return strings[uni(gen)]; }); + constexpr int string_per_row = 3; + constexpr int num_string_rows = num_rows * string_per_row; + cudf::test::strings_column_wrapper 
string_col{string_iter, string_iter + num_string_rows}; + auto offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, + offset_iter + num_rows + 1); + + auto _c3_valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); + std::vector c3_valids(num_rows); + std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin()); + std::tie(null_mask, null_count) = cudf::test::detail::make_null_mask(valids, valids + num_rows); + auto _c3_list = cudf::make_lists_column( + num_rows, offsets.release(), string_col.release(), null_count, std::move(null_mask)); + auto c3_list = cudf::purge_nonempty_nulls(*_c3_list); + cudf::test::fixed_width_column_wrapper c3_ints(values, values + num_rows, valids); + cudf::test::fixed_width_column_wrapper c3_floats(values, values + num_rows, valids); + std::vector> c3_children; + c3_children.push_back(std::move(c3_list)); + c3_children.push_back(c3_ints.release()); + c3_children.push_back(c3_floats.release()); + cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids); + auto c3 = cudf::purge_nonempty_nulls(_c3); + + // write it out + cudf::table_view tbl({c0, *c1, *c2, *c3}); + auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsMixedTypes.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_parquet(out_args); + + // read it back + std::vector> params{ + {-1, -1}, {0, num_rows}, {1, num_rows - 1}, {num_rows - 1, 1}, {517, 22000}}; + for (auto p : params) { + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = 
cudf::io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? num_rows - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(tbl, slice_indices); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected[0]); + } +} + +TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) +{ + constexpr int num_rows = 30 * 1000000; + + std::mt19937 gen(6747); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); + + // this file will have row groups of 1,000,000 each + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1}, + {31, 1}, {32, 1}, {33, 1}, + // deliberately span some row group boundaries + {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1}, + {16785678, 3}, {22996176, 31}, + {24001231, 17}, {29000001, 989999}, {29999999, 1} }; + // clang-format on + for (auto p : params) { + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = cudf::io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? 
static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + +TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge) +{ + constexpr int num_rows = 5 * 1000000; + auto colp = make_parquet_list_list_col(0, num_rows, 5, 8, true); + cudf::column_view col = *colp; + + // this file will have row groups of 1,000,000 each + cudf::table_view tbl({col}); + auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_parquet(out_args); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1}, + {31, 1}, {32, 1}, {33, 1}, + // deliberately span some row group boundaries + {999000, 1001}, {999000, 2000}, {2999999, 2}, + {1678567, 3}, {4299676, 31}, + {4001231, 17}, {1900000, 989999}, {4999999, 1} }; + // clang-format on + for (auto p : params) { + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = cudf::io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? 
static_cast(col).size() - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + auto expected = cudf::slice(col, slice_indices); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); + } +} + +TEST_F(ParquetReaderTest, ReorderedColumns) +{ + { + auto a = cudf::test::strings_column_wrapper{{"a", "", "c"}, {true, false, true}}; + auto b = cudf::test::fixed_width_column_wrapper{1, 2, 3}; + + cudf::table_view tbl{{a, b}}; + auto filepath = temp_env->get_temp_filepath("ReorderedColumns.parquet"); + cudf::io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl).metadata(md); + cudf::io::write_parquet(opts); + + // read them out of order + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .columns({"b", "a"}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), b); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); + } + + { + auto a = cudf::test::fixed_width_column_wrapper{1, 2, 3}; + auto b = cudf::test::strings_column_wrapper{{"a", "", "c"}, {true, false, true}}; + + cudf::table_view tbl{{a, b}}; + auto filepath = temp_env->get_temp_filepath("ReorderedColumns2.parquet"); + cudf::io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl).metadata(md); + cudf::io::write_parquet(opts); + + // read them out of order + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .columns({"b", "a"}); + auto result = cudf::io::read_parquet(read_opts); + + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), b); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); + } + + auto a = cudf::test::fixed_width_column_wrapper{1, 2, 3, 10, 20, 30}; + auto b = cudf::test::strings_column_wrapper{{"a", "", "c", "cats", "dogs", "owls"}, + {true, false, true, true, false, true}}; + auto c = cudf::test::fixed_width_column_wrapper{{15, 16, 17, 25, 26, 32}, + {false, true, true, true, true, false}}; + auto d = cudf::test::strings_column_wrapper{"ducks", "sheep", "cows", "fish", "birds", "ants"}; + + cudf::table_view tbl{{a, b, c, d}}; + auto filepath = temp_env->get_temp_filepath("ReorderedColumns3.parquet"); + cudf::io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); + md.column_metadata[2].set_name("c"); + md.column_metadata[3].set_name("d"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .metadata(std::move(md)); + cudf::io::write_parquet(opts); + + { + // read them out of order + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .columns({"d", "a", "b", "c"}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), d); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), b); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), c); + } + + { + // read them out of order + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .columns({"c", "d", "a", "b"}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), c); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), d); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), a); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), b); + } + + { + // read them out of order + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .columns({"d", "c", "b", "a"}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), d); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), c); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), b); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), a); + } +} + +TEST_F(ParquetReaderTest, SelectNestedColumn) +{ + // Struct>, + // flats:List> + // > + // > + + auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto input = table_view({*struct_2}); + + cudf::io::table_input_metadata input_metadata(input); + input_metadata.column_metadata[0].set_name("being"); + input_metadata.column_metadata[0].child(0).set_name("human?"); + input_metadata.column_metadata[0].child(1).set_name("particulars"); + input_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + input_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("SelectNestedColumn.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, input) + .metadata(std::move(input_metadata)); + 
cudf::io::write_parquet(args); + + { // Test selecting a single leaf from the table + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) + .columns({"being.particulars.age"}); + auto const result = cudf::io::read_parquet(read_args); + + auto expect_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}}; + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + } + + { // Test selecting a non-leaf and expecting all hierarchy from that node onwards + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) + .columns({"being.particulars"}); + auto const result = cudf::io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_s_1 = cudf::test::structs_column_wrapper{ + {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); 
+ expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + } + + { // Test selecting struct children out of order + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) + .columns({"being.particulars.age", "being.particulars.weight", "being.human?"}); + auto const result = cudf::io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col}, + {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}} + .release(); + + auto expected = table_view({*expect_s_2}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("weight"); + expected_metadata.column_metadata[0].child(1).set_name("human?"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + } +} + +TEST_F(ParquetReaderTest, DecimalRead) +{ + { + /* We could add a dataset to include this 
file, but we don't want tests in cudf to have data. + This test is a temporary test until python gains the ability to write decimal, so we're + embedding + a parquet file directly into the code here to prevent issues with finding the file */ + unsigned char const decimals_parquet[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a, + 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, + 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, + 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, 0x68, 0x12, 0xdc, 0xff, 0xbd, 0x18, 0xfd, 0xff, 0x64, + 0x13, 0x80, 0x00, 0xb3, 0x5d, 0x62, 0x00, 0x90, 0x35, 0xa9, 0xff, 0xa2, 0xde, 0xe3, 0xff, + 0xe9, 0xbf, 0x96, 0xff, 0x1f, 0x8a, 0x98, 0xff, 0xb1, 0x50, 0x34, 0x00, 0x88, 0x24, 0x59, + 0x00, 0x2a, 0x33, 0xbe, 0xff, 0xd5, 0x16, 0xbc, 0xff, 0x13, 0x50, 0x8d, 0xff, 0xcb, 0x63, + 0x2d, 0x00, 0x80, 0x8f, 0xbe, 0xff, 0x82, 0x40, 0x10, 0x00, 0x84, 0x68, 0x70, 0xff, 0x9b, + 0x69, 0x78, 0x00, 0x14, 0x6c, 0x10, 0x00, 0x50, 0xd9, 0xe1, 0xff, 0xaa, 0xcd, 0x6a, 0x00, + 0xcf, 0xb1, 0x28, 0x00, 0x77, 0x57, 0x8d, 0x00, 0xee, 0x05, 0x79, 0x00, 0xf0, 0x15, 0xeb, + 0xff, 0x02, 0xe2, 0x06, 0x00, 0x87, 0x43, 0x86, 0x00, 0xf8, 0x2d, 0x2e, 0x00, 0xee, 0x2e, + 0x98, 0xff, 0x39, 0xcb, 0x4d, 0x00, 0x1e, 0x6b, 0xea, 0xff, 0x80, 0x8e, 0x6c, 0xff, 0x97, + 0x25, 0x26, 0x00, 0x4d, 0x0d, 0x0a, 0x00, 0xca, 0x64, 0x7f, 0x00, 0xf4, 0xbe, 0xa1, 0xff, + 0xe2, 0x12, 0x6c, 0xff, 0xbd, 0x77, 0xae, 0xff, 0xf9, 0x4b, 0x36, 0x00, 0xb0, 0xe3, 0x79, + 0xff, 0xa2, 0x2a, 0x29, 0x00, 0xcd, 0x06, 0xbc, 0xff, 0x2d, 0xa3, 0x7e, 0x00, 0xa9, 0x08, + 0xa1, 0xff, 0xbf, 0x81, 0xd0, 0xff, 0x4f, 0x03, 0x73, 0x00, 0xb0, 0x99, 0x0c, 0x00, 0xbd, + 0x6f, 0xf8, 0xff, 0x6b, 0x02, 0x05, 0x00, 0xc1, 0xe1, 0xba, 0xff, 0x81, 0x69, 0x67, 0xff, + 0x7f, 0x96, 0x98, 0x00, 0x15, 0x00, 0x15, 0xd0, 0x06, 0x15, 0xda, 0x06, 0x2c, 0x15, 0x6a, + 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 
0x28, 0x08, 0xff, 0x3f, 0x7a, 0x10, + 0xf3, 0x5a, 0x00, 0x00, 0x18, 0x08, 0x01, 0xc0, 0x85, 0xef, 0x0c, 0xa5, 0xff, 0xff, 0x00, + 0x00, 0x00, 0xa8, 0x03, 0xf4, 0xa7, 0x01, 0x04, 0x00, 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, + 0x55, 0x6f, 0xc5, 0xe4, 0x9f, 0x1a, 0x00, 0x00, 0x47, 0x89, 0x0a, 0xe8, 0x58, 0xf0, 0xff, + 0xff, 0x63, 0xee, 0x21, 0xdd, 0xdd, 0xca, 0xff, 0xff, 0xbe, 0x6f, 0x3b, 0xaa, 0xe9, 0x3d, + 0x00, 0x00, 0xd6, 0x91, 0x2a, 0xb7, 0x08, 0x02, 0x00, 0x00, 0x75, 0x45, 0x2c, 0xd7, 0x76, + 0x0c, 0x00, 0x00, 0x54, 0x49, 0x92, 0x44, 0x9c, 0xbf, 0xff, 0xff, 0x41, 0xa9, 0x6d, 0xec, + 0x7a, 0xd0, 0xff, 0xff, 0x27, 0xa0, 0x23, 0x41, 0x44, 0xc1, 0xff, 0xff, 0x18, 0xd4, 0xe1, + 0x30, 0xd3, 0xe0, 0xff, 0xff, 0x59, 0xac, 0x14, 0xf4, 0xec, 0x58, 0x00, 0x00, 0x2c, 0x17, + 0x29, 0x57, 0x44, 0x13, 0x00, 0x00, 0xa2, 0x0d, 0x4a, 0xcc, 0x63, 0xff, 0xff, 0xff, 0x81, + 0x33, 0xbc, 0xda, 0xd5, 0xda, 0xff, 0xff, 0x4c, 0x05, 0xf4, 0x78, 0x19, 0xea, 0xff, 0xff, + 0x06, 0x71, 0x25, 0xde, 0x5a, 0xaf, 0xff, 0xff, 0x95, 0x32, 0x5f, 0x76, 0x98, 0xb3, 0xff, + 0xff, 0xf1, 0x34, 0x3c, 0xbf, 0xa8, 0xbe, 0xff, 0xff, 0x27, 0x73, 0x40, 0x0c, 0x7d, 0xcd, + 0xff, 0xff, 0x68, 0xa9, 0xc2, 0xe9, 0x2c, 0x03, 0x00, 0x00, 0x3f, 0x79, 0xd9, 0x04, 0x8c, + 0xe5, 0xff, 0xff, 0x91, 0xb4, 0x9b, 0xe3, 0x8f, 0x21, 0x00, 0x00, 0xb8, 0x20, 0xc8, 0xc2, + 0x4d, 0xa6, 0xff, 0xff, 0x47, 0xfa, 0xde, 0x36, 0x4a, 0xf3, 0xff, 0xff, 0x72, 0x80, 0x94, + 0x59, 0xdd, 0x4e, 0x00, 0x00, 0x29, 0xe4, 0xd6, 0x43, 0xb0, 0xf0, 0xff, 0xff, 0x68, 0x36, + 0xbc, 0x2d, 0xd1, 0xa9, 0xff, 0xff, 0xbc, 0xe4, 0xbe, 0xd7, 0xed, 0x1b, 0x00, 0x00, 0x02, + 0x8b, 0xcb, 0xd7, 0xed, 0x47, 0x00, 0x00, 0x3c, 0x06, 0xe4, 0xda, 0xc7, 0x47, 0x00, 0x00, + 0xf3, 0x39, 0x55, 0x28, 0x97, 0xba, 0xff, 0xff, 0x07, 0x79, 0x38, 0x4e, 0xe0, 0x21, 0x00, + 0x00, 0xde, 0xed, 0x1c, 0x23, 0x09, 0x49, 0x00, 0x00, 0x49, 0x46, 0x49, 0x5d, 0x8f, 0x34, + 0x00, 0x00, 0x38, 0x18, 0x50, 0xf6, 0xa1, 0x11, 0x00, 0x00, 0xdf, 0xb8, 0x19, 0x14, 0xd1, + 0xe1, 0xff, 0xff, 0x2c, 0x56, 
0x72, 0x93, 0x64, 0x3f, 0x00, 0x00, 0x1c, 0xe0, 0xbe, 0x87, + 0x7d, 0xf9, 0xff, 0xff, 0x73, 0x0e, 0x3c, 0x01, 0x91, 0xf9, 0xff, 0xff, 0xb2, 0x37, 0x85, + 0x81, 0x5f, 0x54, 0x00, 0x00, 0x58, 0x44, 0xb0, 0x1a, 0xac, 0xbb, 0xff, 0xff, 0x36, 0xbf, + 0xbe, 0x5e, 0x22, 0xff, 0xff, 0xff, 0x06, 0x20, 0xa0, 0x23, 0x0d, 0x3b, 0x00, 0x00, 0x19, + 0xc6, 0x49, 0x0a, 0x00, 0xcf, 0xff, 0xff, 0x4f, 0xcd, 0xc6, 0x95, 0x4b, 0xf1, 0xff, 0xff, + 0xa3, 0x59, 0xaf, 0x65, 0xec, 0xe9, 0xff, 0xff, 0x58, 0xef, 0x05, 0x50, 0x63, 0xe4, 0xff, + 0xff, 0xc7, 0x6a, 0x9e, 0xf1, 0x69, 0x20, 0x00, 0x00, 0xd1, 0xb3, 0xc9, 0x14, 0xb2, 0x29, + 0x00, 0x00, 0x1d, 0x48, 0x16, 0x70, 0xf0, 0x40, 0x00, 0x00, 0x01, 0xc0, 0x85, 0xef, 0x0c, + 0xa5, 0xff, 0xff, 0xff, 0x3f, 0x7a, 0x10, 0xf3, 0x5a, 0x00, 0x00, 0x15, 0x00, 0x15, 0x90, + 0x0d, 0x15, 0x9a, 0x0d, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, + 0x02, 0x28, 0x10, 0x4b, 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, + 0xff, 0xff, 0xff, 0xff, 0x18, 0x10, 0xb4, 0xc4, 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, + 0x75, 0xdd, 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xc8, 0x06, 0xf4, 0x47, 0x03, + 0x04, 0x00, 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, 0x05, 0x49, 0xf7, 0xfc, 0x89, 0x3d, 0x3e, + 0x20, 0x07, 0x72, 0x3e, 0xa1, 0x66, 0x81, 0x67, 0x80, 0x23, 0x78, 0x06, 0x68, 0x0e, 0x78, + 0xf5, 0x08, 0xed, 0x20, 0xcd, 0x0e, 0x7f, 0x9c, 0x70, 0xa0, 0xb9, 0x16, 0x44, 0xb2, 0x41, + 0x62, 0xba, 0x82, 0xad, 0xe1, 0x12, 0x9b, 0xa6, 0x53, 0x8d, 0x20, 0x27, 0xd5, 0x84, 0x63, + 0xb8, 0x07, 0x4b, 0x5b, 0xa4, 0x1c, 0xa4, 0x1c, 0x17, 0xbf, 0x4b, 0x00, 0x24, 0x04, 0x56, + 0xa8, 0x52, 0xaf, 0x33, 0xf7, 0xad, 0x7c, 0xc8, 0x83, 0x25, 0x13, 0xaf, 0x80, 0x25, 0x6f, + 0xbd, 0xd1, 0x15, 0x69, 0x64, 0x20, 0x7b, 0xd7, 0x33, 0xba, 0x66, 0x29, 0x8a, 0x00, 0xda, + 0x42, 0x07, 0x2c, 0x6c, 0x39, 0x76, 0x9f, 0xdc, 0x17, 0xad, 0xb6, 0x58, 0xdf, 0x5f, 0x00, + 0x18, 0x3a, 0xae, 0x1c, 0xd6, 0x5f, 0x9d, 0x78, 0x8d, 0x73, 0xdd, 0x3e, 0xd6, 0x18, 0x33, + 0x40, 
0xe4, 0x36, 0xde, 0xb0, 0xb7, 0x33, 0x2a, 0x6b, 0x08, 0x03, 0x6c, 0x6d, 0x8f, 0x13, + 0x93, 0xd0, 0xd7, 0x87, 0x62, 0x63, 0x53, 0xfb, 0xd8, 0xbb, 0xc9, 0x54, 0x90, 0xd6, 0xa9, + 0x8f, 0xc8, 0x60, 0xbd, 0xec, 0x75, 0x23, 0x9a, 0x21, 0xec, 0xe4, 0x86, 0x43, 0xd7, 0xc1, + 0x88, 0xdc, 0x82, 0x00, 0x32, 0x79, 0xc9, 0x2b, 0x70, 0x85, 0xb7, 0x25, 0xa1, 0xcc, 0x7d, + 0x0b, 0x29, 0x03, 0xea, 0x80, 0xff, 0x9b, 0xf3, 0x24, 0x7f, 0xd1, 0xff, 0xf0, 0x22, 0x65, + 0x85, 0x99, 0x17, 0x63, 0xc2, 0xc0, 0xb7, 0x62, 0x05, 0xda, 0x7a, 0xa0, 0xc3, 0x2a, 0x6f, + 0x1f, 0xee, 0x1f, 0x31, 0xa8, 0x42, 0x80, 0xe4, 0xb7, 0x6c, 0xf6, 0xac, 0x47, 0xb0, 0x17, + 0x69, 0xcb, 0xff, 0x66, 0x8a, 0xd6, 0x25, 0x00, 0xf3, 0xcf, 0x0a, 0xaf, 0xf8, 0x92, 0x8a, + 0xa0, 0xdf, 0x71, 0x13, 0x8d, 0x9d, 0xff, 0x7e, 0xe0, 0x0a, 0x52, 0xf1, 0x97, 0x01, 0xa9, + 0x73, 0x27, 0xfd, 0x63, 0x58, 0x00, 0x32, 0xa6, 0xf6, 0x78, 0xb8, 0xe4, 0xfd, 0x20, 0x7c, + 0x90, 0xee, 0xad, 0x8c, 0xc9, 0x71, 0x35, 0x66, 0x71, 0x3c, 0xe0, 0xe4, 0x0b, 0xbb, 0xa0, + 0x50, 0xe9, 0xf2, 0x81, 0x1d, 0x3a, 0x95, 0x94, 0x00, 0xd5, 0x49, 0x00, 0x07, 0xdf, 0x21, + 0x53, 0x36, 0x8d, 0x9e, 0xd9, 0xa5, 0x52, 0x4d, 0x0d, 0x29, 0x74, 0xf0, 0x40, 0xbd, 0xda, + 0x63, 0x4e, 0xdd, 0x91, 0x8e, 0xa6, 0xa7, 0xf6, 0x78, 0x58, 0x3b, 0x0a, 0x5c, 0x60, 0x3c, + 0x15, 0x34, 0xf8, 0x2c, 0x21, 0xe3, 0x56, 0x1b, 0x9e, 0xd9, 0x56, 0xd3, 0x13, 0x2e, 0x80, + 0x2c, 0x36, 0xda, 0x1d, 0xc8, 0xfb, 0x52, 0xee, 0x17, 0xb3, 0x2b, 0xf3, 0xd2, 0xeb, 0x29, + 0xa0, 0x37, 0xa0, 0x12, 0xce, 0x1c, 0x50, 0x6a, 0xf4, 0x11, 0xcd, 0x96, 0x88, 0x3f, 0x43, + 0x78, 0xc0, 0x2c, 0x53, 0x6c, 0xa6, 0xdf, 0xb9, 0x9e, 0x93, 0xd4, 0x1e, 0xa9, 0x7f, 0x67, + 0xa6, 0xc1, 0x80, 0x46, 0x0f, 0x63, 0x7d, 0x15, 0xf2, 0x4c, 0xc5, 0xda, 0x11, 0x9a, 0x20, + 0x67, 0x27, 0xe8, 0x00, 0xec, 0x03, 0x1d, 0x15, 0xa7, 0x92, 0xb3, 0x1f, 0xda, 0x20, 0x92, + 0xd8, 0x00, 0xfb, 0x06, 0x80, 0xeb, 0x4b, 0x0c, 0xc1, 0x1f, 0x49, 0x40, 0x06, 0x8d, 0x8a, + 0xf8, 0x34, 0xb1, 0x0c, 0x1d, 0x20, 0xd0, 0x47, 0xe5, 0xb1, 0x7e, 0xf7, 
0xe4, 0xb4, 0x7e, + 0x9c, 0x84, 0x18, 0x61, 0x32, 0x4f, 0xc0, 0xc2, 0xb2, 0xcc, 0x63, 0xf6, 0xe1, 0x16, 0xd6, + 0xd9, 0x4b, 0x74, 0x13, 0x01, 0xa1, 0xe2, 0x00, 0xb7, 0x9e, 0xc1, 0x3a, 0xc5, 0xaf, 0xe8, + 0x54, 0x07, 0x2a, 0x20, 0xfd, 0x2c, 0x6f, 0xb9, 0x80, 0x18, 0x92, 0x87, 0xa0, 0x81, 0x24, + 0x60, 0x47, 0x17, 0x4f, 0xbc, 0xbe, 0xf5, 0x03, 0x69, 0x80, 0xe3, 0x10, 0x54, 0xd6, 0x68, + 0x7d, 0x75, 0xd3, 0x0a, 0x45, 0x38, 0x9e, 0xa9, 0xfd, 0x05, 0x40, 0xd2, 0x1e, 0x6f, 0x5c, + 0x30, 0x10, 0xfe, 0x9b, 0x9f, 0x6d, 0xc0, 0x9d, 0x6c, 0x17, 0x7d, 0x00, 0x09, 0xb6, 0x8a, + 0x31, 0x8e, 0x1b, 0x6b, 0x84, 0x1e, 0x79, 0xce, 0x10, 0x55, 0x59, 0x6a, 0x40, 0x16, 0xdc, + 0x9a, 0xcf, 0x4d, 0xb0, 0x8f, 0xac, 0xe3, 0x8d, 0xee, 0xd2, 0xef, 0x01, 0x8c, 0xe0, 0x2b, + 0x24, 0xe5, 0xb4, 0xe1, 0x86, 0x72, 0x00, 0x30, 0x07, 0xce, 0x02, 0x23, 0x41, 0x33, 0x40, + 0xf0, 0x9b, 0xc2, 0x2d, 0x30, 0xec, 0x3b, 0x17, 0xb2, 0x8f, 0x64, 0x7d, 0xcd, 0x70, 0x9e, + 0x80, 0x22, 0xb5, 0xdf, 0x6d, 0x2a, 0x43, 0xd4, 0x2b, 0x5a, 0xf6, 0x96, 0xa6, 0xea, 0x91, + 0x62, 0x80, 0x39, 0xf2, 0x5a, 0x8e, 0xc0, 0xb9, 0x29, 0x99, 0x17, 0xe7, 0x35, 0x2c, 0xf6, + 0x4d, 0x18, 0x00, 0x48, 0x10, 0x85, 0xb4, 0x3f, 0x89, 0x60, 0x49, 0x6e, 0xf0, 0xcd, 0x9d, + 0x92, 0xeb, 0x96, 0x80, 0xcf, 0xf9, 0xf1, 0x46, 0x1d, 0xc0, 0x49, 0xb3, 0x36, 0x2e, 0x24, + 0xc8, 0xdb, 0x41, 0x72, 0x20, 0xf5, 0xde, 0x5c, 0xf9, 0x4a, 0x6e, 0xa0, 0x0b, 0x13, 0xfc, + 0x2d, 0x17, 0x07, 0x16, 0x5e, 0x00, 0x3c, 0x54, 0x41, 0x0e, 0xa2, 0x0d, 0xf3, 0x48, 0x12, + 0x2e, 0x7c, 0xab, 0x3c, 0x59, 0x1c, 0x40, 0xca, 0xb0, 0x71, 0xc7, 0x29, 0xf0, 0xbb, 0x9f, + 0xf4, 0x3f, 0x25, 0x49, 0xad, 0xc2, 0x8f, 0x80, 0x04, 0x38, 0x6d, 0x35, 0x02, 0xca, 0xe6, + 0x02, 0x83, 0x89, 0x4e, 0x74, 0xdb, 0x08, 0x5a, 0x80, 0x13, 0x99, 0xd4, 0x26, 0xc1, 0x27, + 0xce, 0xb0, 0x98, 0x99, 0xca, 0xf6, 0x3e, 0x50, 0x49, 0xd0, 0xbf, 0xcb, 0x6f, 0xbe, 0x5b, + 0x92, 0x63, 0xde, 0x94, 0xd3, 0x8f, 0x07, 0x06, 0x0f, 0x2b, 0x80, 0x36, 0xf1, 0x77, 0xf6, + 0x29, 0x33, 0x13, 0xa9, 0x4a, 0x55, 0x3d, 0x6c, 
0xca, 0xdb, 0x4e, 0x40, 0xc4, 0x95, 0x54, + 0xf4, 0xe2, 0x8c, 0x1b, 0xa0, 0xfe, 0x30, 0x50, 0x9d, 0x62, 0xbc, 0x5c, 0x00, 0xb4, 0xc4, + 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, 0x75, 0xdd, 0xc0, 0x00, 0x00, 0x00, 0x01, 0x4b, + 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, 0xff, 0xff, 0xff, 0xff, + 0x15, 0x02, 0x19, 0x4c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x61, 0x15, 0x06, 0x00, 0x15, 0x02, 0x25, 0x02, 0x18, 0x06, 0x64, 0x65, 0x63, + 0x37, 0x70, 0x34, 0x25, 0x0a, 0x15, 0x08, 0x15, 0x0e, 0x00, 0x15, 0x04, 0x25, 0x02, 0x18, + 0x07, 0x64, 0x65, 0x63, 0x31, 0x34, 0x70, 0x35, 0x25, 0x0a, 0x15, 0x0a, 0x15, 0x1c, 0x00, + 0x15, 0x0e, 0x15, 0x20, 0x15, 0x02, 0x18, 0x08, 0x64, 0x65, 0x63, 0x33, 0x38, 0x70, 0x31, + 0x38, 0x25, 0x0a, 0x15, 0x24, 0x15, 0x4c, 0x00, 0x16, 0x6a, 0x19, 0x1c, 0x19, 0x3c, 0x26, + 0x08, 0x1c, 0x15, 0x02, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x06, 0x64, 0x65, 0x63, + 0x37, 0x70, 0x34, 0x15, 0x02, 0x16, 0x6a, 0x16, 0xf6, 0x03, 0x16, 0xfe, 0x03, 0x26, 0x08, + 0x3c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, + 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0x86, 0x04, + 0x1c, 0x15, 0x04, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x07, 0x64, 0x65, 0x63, 0x31, + 0x34, 0x70, 0x35, 0x15, 0x02, 0x16, 0x6a, 0x16, 0xa6, 0x07, 0x16, 0xb0, 0x07, 0x26, 0x86, + 0x04, 0x3c, 0x36, 0x02, 0x28, 0x08, 0xff, 0x3f, 0x7a, 0x10, 0xf3, 0x5a, 0x00, 0x00, 0x18, + 0x08, 0x01, 0xc0, 0x85, 0xef, 0x0c, 0xa5, 0xff, 0xff, 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, + 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0xb6, 0x0b, 0x1c, 0x15, 0x0e, 0x19, 0x35, 0x06, + 0x08, 0x00, 0x19, 0x18, 0x08, 0x64, 0x65, 0x63, 0x33, 0x38, 0x70, 0x31, 0x38, 0x15, 0x02, + 0x16, 0x6a, 0x16, 0x86, 0x0e, 0x16, 0x90, 0x0e, 0x26, 0xb6, 0x0b, 0x3c, 0x36, 0x02, 0x28, + 0x10, 0x4b, 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, 0xff, 0xff, + 0xff, 0xff, 0x18, 0x10, 
0xb4, 0xc4, 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, 0x75, 0xdd, + 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, + 0x00, 0x00, 0x16, 0xa2, 0x19, 0x16, 0x6a, 0x00, 0x19, 0x2c, 0x18, 0x18, 0x6f, 0x72, 0x67, + 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x2e, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x05, 0x33, 0x2e, 0x30, 0x2e, 0x31, 0x00, 0x18, + 0x29, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x73, 0x70, 0x61, + 0x72, 0x6b, 0x2e, 0x73, 0x71, 0x6c, 0x2e, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2e, + 0x72, 0x6f, 0x77, 0x2e, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0xf4, 0x01, + 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x22, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, + 0x22, 0x2c, 0x22, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x22, 0x3a, 0x5b, 0x7b, 0x22, 0x6e, + 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x37, 0x70, 0x34, 0x22, 0x2c, 0x22, + 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, + 0x37, 0x2c, 0x34, 0x29, 0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, + 0x22, 0x3a, 0x74, 0x72, 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, + 0x61, 0x22, 0x3a, 0x7b, 0x7d, 0x7d, 0x2c, 0x7b, 0x22, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, + 0x22, 0x64, 0x65, 0x63, 0x31, 0x34, 0x70, 0x35, 0x22, 0x2c, 0x22, 0x74, 0x79, 0x70, 0x65, + 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, 0x31, 0x34, 0x2c, 0x35, + 0x29, 0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x22, 0x3a, 0x74, + 0x72, 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3a, + 0x7b, 0x7d, 0x7d, 0x2c, 0x7b, 0x22, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, + 0x63, 0x33, 0x38, 0x70, 0x31, 0x38, 0x22, 0x2c, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, + 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, 0x33, 0x38, 0x2c, 0x31, 0x38, 0x29, + 
0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x22, 0x3a, 0x74, 0x72, + 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3a, 0x7b, + 0x7d, 0x7d, 0x5d, 0x7d, 0x00, 0x18, 0x4a, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, + 0x6d, 0x72, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x31, 0x30, + 0x2e, 0x31, 0x20, 0x28, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x20, 0x61, 0x38, 0x39, 0x64, 0x66, + 0x38, 0x66, 0x39, 0x39, 0x33, 0x32, 0x62, 0x36, 0x65, 0x66, 0x36, 0x36, 0x33, 0x33, 0x64, + 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, + 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, + 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + unsigned int decimals_parquet_len = 2366; + + cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); + auto result = cudf::io::read_parquet(read_opts); + + auto validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 50; }); + + EXPECT_EQ(result.tbl->view().num_columns(), 3); + + int32_t col0_data[] = { + -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, + 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, + 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, + 3026424, -6803730, 5098297, -1414370, -9662848, 2499991, 658765, 8348874, -6177036, + -9694494, -5343299, 3558393, -8789072, 2697890, -4454707, 8299309, -6223703, -3112513, + 7537487, 825776, -495683, 328299, -4529727, 0, -9999999, 9999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), + sizeof(col0_data) / sizeof(col0_data[0])); + cudf::test::fixed_point_column_wrapper col0( + std::begin(col0_data), std::end(col0_data), validity, 
numeric::scale_type{-4}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); + + int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, + 68073792696254, 2236456014294, 13704555677045, + -70797090469548, -52248605513407, -68976081919961, + -34277313883112, 97774730521689, 21184241014572, + -670882460254, -40862944054399, -24079852370612, + -88670167797498, -84007574359403, -71843004533519, + -55538016554201, 3491435293032, -29085437167297, + 36901882672273, -98622066122568, -13974902998457, + 86712597643378, -16835133643735, -94759096142232, + 30708340810940, 79086853262082, 78923696440892, + -76316597208589, 37247268714759, 80303592631774, + 57790350050889, 19387319851064, -33186875066145, + 69701203023404, -7157433049060, -7073790423437, + 92769171617714, -75127120182184, -951893180618, + 64927618310150, -53875897154023, -16168039035569, + -24273449166429, -30359781249192, 35639397345991, + 45844829680593, 71401416837149, 0, + -99999999999999, 99999999999999}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), + sizeof(col1_data) / sizeof(col1_data[0])); + cudf::test::fixed_point_column_wrapper col1( + std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); + + cudf::io::parquet_reader_options read_strict_opts = read_opts; + read_strict_opts.set_columns({"dec7p4", "dec14p5"}); + EXPECT_NO_THROW(cudf::io::read_parquet(read_strict_opts)); + } + { + // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4) + // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) + // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) + unsigned char const fixed_len_bytes_decimal_parquet[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, + 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 
0x04, 0x00, 0x97, 0x45, 0x72, + 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, + 0x00, 0x26, 0x01, 0x03, 0x00, 0x00, 0x61, 0x10, 0xCF, 0x00, 0x0A, 0xA9, 0x08, 0x00, 0x77, + 0x58, 0x6F, 0x00, 0x6B, 0xEE, 0xA4, 0x00, 0x92, 0xF8, 0x94, 0x00, 0x2E, 0x18, 0xD4, 0x00, + 0x4F, 0x45, 0x33, 0x00, 0x97, 0x45, 0x72, 0x00, 0x0D, 0xC2, 0x75, 0x00, 0x76, 0xAA, 0xAA, + 0x00, 0x30, 0x9F, 0x86, 0x00, 0x4B, 0x9D, 0xB1, 0x00, 0x4E, 0x4B, 0x3B, 0x00, 0x01, 0x81, + 0x3B, 0x00, 0x22, 0xD4, 0x53, 0x00, 0x72, 0xC4, 0xAF, 0x00, 0x43, 0x9B, 0x72, 0x00, 0x1D, + 0x91, 0xC3, 0x00, 0x45, 0x27, 0x48, 0x15, 0x00, 0x15, 0xF4, 0x01, 0x15, 0xFA, 0x01, 0x2C, + 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x06, 0x00, 0xD5, + 0xD7, 0x31, 0x99, 0xA6, 0x18, 0x06, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x00, 0x00, + 0x7A, 0xF0, 0x79, 0x04, 0x00, 0x00, 0x00, 0x24, 0x01, 0x03, 0x02, 0x00, 0x54, 0x23, 0xCF, + 0x13, 0x0A, 0x00, 0x07, 0x22, 0xB1, 0x21, 0x7E, 0x00, 0x64, 0x19, 0xD6, 0xD2, 0xA5, 0x00, + 0x61, 0x7F, 0xF6, 0xB9, 0xB0, 0x00, 0xD0, 0x7F, 0x9C, 0xA9, 0xE9, 0x00, 0x65, 0x58, 0xF0, + 0xAD, 0xFB, 0x00, 0xBC, 0x61, 0xE2, 0x03, 0xDA, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, + 0x63, 0x4B, 0x4C, 0xFE, 0x45, 0x00, 0x7A, 0xA0, 0xD8, 0xD1, 0xC0, 0x00, 0xC0, 0x63, 0xF7, + 0x9D, 0x0A, 0x00, 0x88, 0x22, 0x0F, 0x1B, 0x25, 0x00, 0x1A, 0x80, 0x56, 0x34, 0xC7, 0x00, + 0x5F, 0x48, 0x61, 0x09, 0x7C, 0x00, 0x61, 0xEF, 0x92, 0x42, 0x2F, 0x00, 0xD5, 0xD7, 0x31, + 0x99, 0xA6, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x71, 0xDD, 0xE2, 0x22, 0x7B, 0x00, + 0x54, 0xBF, 0xAE, 0xE9, 0x3C, 0x15, 0x00, 0x15, 0xD4, 0x02, 0x15, 0xDC, 0x02, 0x2C, 0x15, + 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x04, 0x28, 0x09, 0x00, 0x7D, 0xFE, + 0x02, 0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x18, 0x09, 0x00, 0x03, 0x9C, 0xCD, 0x5A, 0xAC, 0xBB, + 0xF1, 0xE3, 0x00, 0x00, 0x00, 0xAA, 0x01, 0xF0, 0xA9, 0x04, 0x00, 0x00, 0x00, 0x07, 0xBF, + 0xBF, 0x0F, 0x00, 0x7D, 0xFE, 0x02, 
0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x00, 0x7D, 0x9A, 0xCB, + 0xDA, 0x4B, 0x10, 0x8B, 0xAC, 0x00, 0x20, 0xBA, 0x97, 0x87, 0x2E, 0x3B, 0x4E, 0x04, 0x00, + 0x15, 0xBB, 0xC2, 0xDF, 0x2D, 0x25, 0x08, 0xB6, 0x00, 0x5C, 0x67, 0x0E, 0x36, 0x30, 0xF1, + 0xAC, 0xA4, 0x00, 0x44, 0xF1, 0x8E, 0xFB, 0x17, 0x5E, 0xE1, 0x96, 0x00, 0x64, 0x69, 0xF9, + 0x66, 0x3F, 0x11, 0xED, 0xB9, 0x00, 0x45, 0xB5, 0xDA, 0x14, 0x9C, 0xA3, 0xFA, 0x64, 0x00, + 0x26, 0x5F, 0xDE, 0xD7, 0x67, 0x95, 0xEF, 0xB1, 0x00, 0x35, 0xDB, 0x9B, 0x88, 0x46, 0xD0, + 0xA1, 0x0E, 0x00, 0x45, 0xA9, 0x92, 0x8E, 0x89, 0xD1, 0xAC, 0x4C, 0x00, 0x4C, 0xF1, 0xCB, + 0x27, 0x82, 0x3A, 0x7D, 0xB7, 0x00, 0x64, 0xD3, 0xD2, 0x2F, 0x9C, 0x83, 0x16, 0x75, 0x00, + 0x15, 0xDF, 0xC2, 0xA9, 0x63, 0xB8, 0x33, 0x65, 0x00, 0x27, 0x40, 0x28, 0x97, 0x05, 0x8E, + 0xE3, 0x46, 0x00, 0x03, 0x9C, 0xCD, 0x5A, 0xAC, 0xBB, 0xF1, 0xE3, 0x00, 0x22, 0x23, 0xF5, + 0xE8, 0x9D, 0x55, 0xD4, 0x9C, 0x00, 0x25, 0xB9, 0xD8, 0x87, 0x2D, 0xF1, 0xF2, 0x17, 0x15, + 0x02, 0x19, 0x4C, 0x48, 0x0C, 0x73, 0x70, 0x61, 0x72, 0x6B, 0x5F, 0x73, 0x63, 0x68, 0x65, + 0x6D, 0x61, 0x15, 0x06, 0x00, 0x15, 0x0E, 0x15, 0x08, 0x15, 0x02, 0x18, 0x06, 0x64, 0x65, + 0x63, 0x37, 0x70, 0x33, 0x25, 0x0A, 0x15, 0x06, 0x15, 0x0E, 0x00, 0x15, 0x0E, 0x15, 0x0C, + 0x15, 0x02, 0x18, 0x08, 0x64, 0x65, 0x63, 0x31, 0x32, 0x70, 0x31, 0x31, 0x25, 0x0A, 0x15, + 0x16, 0x15, 0x18, 0x00, 0x15, 0x0E, 0x15, 0x12, 0x15, 0x02, 0x18, 0x07, 0x64, 0x65, 0x63, + 0x32, 0x30, 0x70, 0x31, 0x25, 0x0A, 0x15, 0x02, 0x15, 0x28, 0x00, 0x16, 0x28, 0x19, 0x1C, + 0x19, 0x3C, 0x26, 0x08, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x06, + 0x64, 0x65, 0x63, 0x37, 0x70, 0x33, 0x15, 0x02, 0x16, 0x28, 0x16, 0xEE, 0x01, 0x16, 0xF4, + 0x01, 0x26, 0x08, 0x3C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, + 0x01, 0x81, 0x3B, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, + 0x26, 0xFC, 0x01, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x08, 0x64, + 0x65, 0x63, 
0x31, 0x32, 0x70, 0x31, 0x31, 0x15, 0x02, 0x16, 0x28, 0x16, 0xC2, 0x02, 0x16, + 0xC8, 0x02, 0x26, 0xFC, 0x01, 0x3C, 0x36, 0x02, 0x28, 0x06, 0x00, 0xD5, 0xD7, 0x31, 0x99, + 0xA6, 0x18, 0x06, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, + 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0xC4, 0x04, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, + 0x08, 0x00, 0x19, 0x18, 0x07, 0x64, 0x65, 0x63, 0x32, 0x30, 0x70, 0x31, 0x15, 0x02, 0x16, + 0x28, 0x16, 0xAE, 0x03, 0x16, 0xB6, 0x03, 0x26, 0xC4, 0x04, 0x3C, 0x36, 0x04, 0x28, 0x09, + 0x00, 0x7D, 0xFE, 0x02, 0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x18, 0x09, 0x00, 0x03, 0x9C, 0xCD, + 0x5A, 0xAC, 0xBB, 0xF1, 0xE3, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, + 0x00, 0x00, 0x16, 0xDE, 0x07, 0x16, 0x28, 0x00, 0x19, 0x2C, 0x18, 0x18, 0x6F, 0x72, 0x67, + 0x2E, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2E, 0x73, 0x70, 0x61, 0x72, 0x6B, 0x2E, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x18, 0x05, 0x33, 0x2E, 0x30, 0x2E, 0x31, 0x00, 0x18, + 0x29, 0x6F, 0x72, 0x67, 0x2E, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2E, 0x73, 0x70, 0x61, + 0x72, 0x6B, 0x2E, 0x73, 0x71, 0x6C, 0x2E, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2E, + 0x72, 0x6F, 0x77, 0x2E, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0xF4, 0x01, + 0x7B, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x22, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, + 0x22, 0x2C, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x73, 0x22, 0x3A, 0x5B, 0x7B, 0x22, 0x6E, + 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x37, 0x70, 0x33, 0x22, 0x2C, 0x22, + 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, + 0x37, 0x2C, 0x33, 0x29, 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, + 0x22, 0x3A, 0x74, 0x72, 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, + 0x61, 0x22, 0x3A, 0x7B, 0x7D, 0x7D, 0x2C, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, + 0x22, 0x64, 0x65, 0x63, 0x31, 0x32, 0x70, 0x31, 0x31, 0x22, 0x2C, 0x22, 0x74, 
0x79, 0x70, + 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, 0x31, 0x32, 0x2C, + 0x31, 0x31, 0x29, 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, 0x22, + 0x3A, 0x74, 0x72, 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, + 0x22, 0x3A, 0x7B, 0x7D, 0x7D, 0x2C, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x22, + 0x64, 0x65, 0x63, 0x32, 0x30, 0x70, 0x31, 0x22, 0x2C, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, + 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, 0x32, 0x30, 0x2C, 0x31, 0x29, + 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, 0x22, 0x3A, 0x74, 0x72, + 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x7B, + 0x7D, 0x7D, 0x5D, 0x7D, 0x00, 0x18, 0x4A, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2D, + 0x6D, 0x72, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x20, 0x31, 0x2E, 0x31, 0x30, + 0x2E, 0x31, 0x20, 0x28, 0x62, 0x75, 0x69, 0x6C, 0x64, 0x20, 0x61, 0x38, 0x39, 0x64, 0x66, + 0x38, 0x66, 0x39, 0x39, 0x33, 0x32, 0x62, 0x36, 0x65, 0x66, 0x36, 0x36, 0x33, 0x33, 0x64, + 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, + 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, + 0x00, 0x00, 0x00, 0xC5, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31, + }; + + unsigned int parquet_len = 1226; + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{ + reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); + auto result = cudf::io::read_parquet(read_opts); + EXPECT_EQ(result.tbl->view().num_columns(), 3); + + auto validity_c0 = cudf::test::iterators::nulls_at({19}); + int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, + 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, + 2282579, 7521455, 4430706, 1937859, 4532040, 0}; + + 
EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), + sizeof(col0_data) / sizeof(col0_data[0])); + cudf::test::fixed_point_column_wrapper col0( + std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); + + auto validity_c1 = cudf::test::iterators::nulls_at({18}); + int64_t col1_data[] = {361378026250, + 30646804862, + 429930238629, + 418758703536, + 895494171113, + 435283865083, + 809096053722, + -999999999999, + 426465099333, + 526684574144, + 826310892810, + 584686967589, + 113822282951, + 409236212092, + 420631167535, + 918438386086, + -999999999999, + 489053889147, + 0, + 363993164092}; + + EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), + sizeof(col1_data) / sizeof(col1_data[0])); + cudf::test::fixed_point_column_wrapper col1( + std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); + + auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); + __int128_t col2_data[] = {9078697037144433659, + 9050770539577117612, + 2358363961733893636, + 1566059559232276662, + 6658306200002735268, + 4967909073046397334, + 0, + 7235588493887532473, + 5023160741463849572, + 2765173712965988273, + 3880866513515749646, + 5019704400576359500, + 5544435986818825655, + 7265381725809874549, + 0, + 1576192427381240677, + 2828305195087094598, + 260308667809395171, + 2460080200895288476, + 2718441925197820439}; + + EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), + sizeof(col2_data) / sizeof(col2_data[0])); + cudf::test::fixed_point_column_wrapper<__int128_t> col2( + std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); + } +} + +TEST_F(ParquetReaderTest, EmptyOutput) +{ + cudf::test::fixed_width_column_wrapper c0; + cudf::test::strings_column_wrapper c1; + 
cudf::test::fixed_point_column_wrapper c2({}, numeric::scale_type{2}); + cudf::test::lists_column_wrapper _c3{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; + auto c3 = cudf::empty_like(_c3); + + cudf::test::fixed_width_column_wrapper sc0; + cudf::test::strings_column_wrapper sc1; + cudf::test::lists_column_wrapper _sc2{{1, 2}}; + std::vector> struct_children; + struct_children.push_back(sc0.release()); + struct_children.push_back(sc1.release()); + struct_children.push_back(cudf::empty_like(_sc2)); + cudf::test::structs_column_wrapper c4(std::move(struct_children)); + + table_view expected({c0, c1, c2, *c3, c4}); + + // set precision on the decimal column + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[2].set_decimal_precision(1); + + auto filepath = temp_env->get_temp_filepath("EmptyOutput.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + out_args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(out_args); + + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_args); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TEST_F(ParquetReaderTest, EmptyColumnsParam) +{ + srand(31337); + auto const expected = create_random_fixed_table(2, 4, false); + + std::vector out_buffer; + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .columns({}); + auto const result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->num_columns(), 0); + EXPECT_EQ(result.tbl->num_rows(), 0); +} + +TEST_F(ParquetReaderTest, 
BinaryAsStrings) +{ + std::vector strings{ + "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + auto const num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto seq_col3 = random_values(num_rows); + auto validity = cudf::test::iterators::no_nulls(); + + column_wrapper int_col{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper string_col{strings.begin(), strings.end()}; + column_wrapper float_col{seq_col2.begin(), seq_col2.end(), validity}; + cudf::test::lists_column_wrapper list_int_col{ + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}; + + auto output = table_view{{int_col, string_col, float_col, string_col, list_int_col}}; + cudf::io::table_input_metadata output_metadata(output); + output_metadata.column_metadata[0].set_name("col_other"); + output_metadata.column_metadata[1].set_name("col_string"); + output_metadata.column_metadata[2].set_name("col_float"); + output_metadata.column_metadata[3].set_name("col_string2").set_output_as_binary(true); + output_metadata.column_metadata[4].set_name("col_binary").set_output_as_binary(true); + + auto filepath = temp_env->get_temp_filepath("BinaryReadStrings.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output) + .metadata(std::move(output_metadata)); + cudf::io::write_parquet(out_opts); + + auto expected_string = table_view{{int_col, string_col, float_col, string_col, string_col}}; + auto expected_mixed = table_view{{int_col, string_col, float_col, list_int_col, list_int_col}}; + + cudf::io::parquet_reader_options in_opts = + 
cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema({{}, {}, {}, {}, {}}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_string, result.tbl->view()); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_string, result.tbl->view()); + + std::vector md{ + {}, + {}, + {}, + cudf::io::reader_column_schema().set_convert_binary_to_strings(false), + cudf::io::reader_column_schema().set_convert_binary_to_strings(false)}; + + cudf::io::parquet_reader_options mixed_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema(md); + result = cudf::io::read_parquet(mixed_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_mixed, result.tbl->view()); +} + +TEST_F(ParquetReaderTest, NestedByteArray) +{ + constexpr auto num_rows = 8; + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto seq_col3 = random_values(num_rows); + auto const validity = cudf::test::iterators::no_nulls(); + + column_wrapper int_col{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper float_col{seq_col2.begin(), seq_col2.end(), validity}; + cudf::test::lists_column_wrapper list_list_int_col{ + {{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}}, + {{'M', 'o', 'n', 'd', 'a', 'y'}, {'F', 'r', 'i', 'd', 'a', 'y'}}, + {{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}}, + {{'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}, + {{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}}, + {{'F', 'r', 'i', 'd', 'a', 'y'}, + 
{'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}, + {{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}}, + {{'M', 'o', 'n', 'd', 'a', 'y'}, {'F', 'r', 'i', 'd', 'a', 'y'}}}; + + auto const expected = table_view{{int_col, float_col, list_list_int_col}}; + cudf::io::table_input_metadata output_metadata(expected); + output_metadata.column_metadata[0].set_name("col_other"); + output_metadata.column_metadata[1].set_name("col_float"); + output_metadata.column_metadata[2].set_name("col_binary").child(1).set_output_as_binary(true); + + auto filepath = temp_env->get_temp_filepath("NestedByteArray.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(output_metadata)); + cudf::io::write_parquet(out_opts); + + auto source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + + std::vector md{ + {}, + {}, + cudf::io::reader_column_schema().add_child( + cudf::io::reader_column_schema().set_convert_binary_to_strings(false))}; + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema(md); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TEST_F(ParquetReaderTest, StructByteArray) +{ + constexpr auto num_rows = 100; + + auto seq_col0 = random_values(num_rows); + auto const validity = cudf::test::iterators::no_nulls(); + + column_wrapper int_col{seq_col0.begin(), seq_col0.end(), validity}; + cudf::test::lists_column_wrapper list_of_int{{seq_col0.begin(), seq_col0.begin() + 50}, + {seq_col0.begin() + 50, seq_col0.end()}}; + auto struct_col = cudf::test::structs_column_wrapper{{list_of_int}, 
validity}; + + auto const expected = table_view{{struct_col}}; + EXPECT_EQ(1, expected.num_columns()); + cudf::io::table_input_metadata output_metadata(expected); + output_metadata.column_metadata[0] + .set_name("struct_binary") + .child(0) + .set_name("a") + .set_output_as_binary(true); + + auto filepath = temp_env->get_temp_filepath("StructByteArray.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(output_metadata)); + cudf::io::write_parquet(out_opts); + + std::vector md{cudf::io::reader_column_schema().add_child( + cudf::io::reader_column_schema().set_convert_binary_to_strings(false))}; + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema(md); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TEST_F(ParquetReaderTest, NestingOptimizationTest) +{ + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. 
+ constexpr cudf::size_type num_nesting_levels = 16; + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); + constexpr cudf::size_type rows_per_level = 2; + + constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; + auto value_iter = thrust::make_counting_iterator(0); + auto validity = + cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); + cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); + + // ~256k values with num_nesting_levels = 16 + int total_values_produced = num_values; + auto prev_col = values.release(); + for (int idx = 0; idx < num_nesting_levels; idx++) { + auto const depth = num_nesting_levels - idx; + auto const num_rows = (1 << (num_nesting_levels - idx)); + + auto offsets_iter = cudf::detail::make_counting_transform_iterator( + 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); + total_values_produced += (num_rows + 1); + + cudf::test::fixed_width_column_wrapper offsets(offsets_iter, + offsets_iter + num_rows + 1); + auto c = cudf::make_lists_column(num_rows, offsets.release(), std::move(prev_col), 0, {}); + prev_col = std::move(c); + } + auto const& expect = prev_col; + + auto filepath = temp_env->get_temp_filepath("NestingDecodeCache.parquet"); + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table_view{{*expect}}); + cudf::io::write_parquet(opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expect, result.tbl->get_column(0)); +} + +TEST_F(ParquetReaderTest, SingleLevelLists) +{ + unsigned char list_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, + 0x1c, 0x15, 0x04, 0x15, 
0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, + 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, + 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, + 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, + 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, + 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, + 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, + 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, + 0x14, 0x00, 0x00, 0x28, 0x13, 0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, + 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + // read single level list reproducing parquet file + cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); + auto table = cudf::io::read_parquet(read_opts); + + auto const c0 = table.tbl->get_column(0); + EXPECT_TRUE(c0.type().id() == cudf::type_id::LIST); + + auto const lc = cudf::lists_column_view(c0); + auto const child = lc.child(); + EXPECT_TRUE(child.type().id() == cudf::type_id::INT32); +} + +TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) +{ + unsigned char list_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, + 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, + 0x02, 
0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, + 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, + 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, + 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, + 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, + 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, + 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, + 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, + 0x14, 0x00, 0x00, 0x28, 0x13, 0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, + 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto reader = cudf::io::chunked_parquet_reader( + 1L << 31, + cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); + int iterations = 0; + while (reader.has_next() && iterations < 10) { + auto chunk = reader.read_chunk(); + } + EXPECT_TRUE(iterations < 10); +} + +TEST_F(ParquetReaderTest, ReorderedReadMultipleFiles) +{ + constexpr auto num_rows = 50'000; + constexpr auto cardinality = 20'000; + + // table 1 + auto str1 = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "cat " + std::to_string(i % cardinality); }); + auto cols1 = cudf::test::strings_column_wrapper(str1, str1 + num_rows); + + auto int1 = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % cardinality; }); + auto coli1 = cudf::test::fixed_width_column_wrapper(int1, int1 + num_rows); + + auto const 
expected1 = table_view{{cols1, coli1}}; + auto const swapped1 = table_view{{coli1, cols1}}; + + auto const filepath1 = temp_env->get_temp_filepath("LargeReorderedRead1.parquet"); + auto out_opts1 = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath1}, expected1) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(out_opts1); + + // table 2 + auto str2 = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "dog " + std::to_string(i % cardinality); }); + auto cols2 = cudf::test::strings_column_wrapper(str2, str2 + num_rows); + + auto int2 = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % cardinality) + cardinality; }); + auto coli2 = cudf::test::fixed_width_column_wrapper(int2, int2 + num_rows); + + auto const expected2 = table_view{{cols2, coli2}}; + auto const swapped2 = table_view{{coli2, cols2}}; + + auto const filepath2 = temp_env->get_temp_filepath("LargeReorderedRead2.parquet"); + auto out_opts2 = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath2}, expected2) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(out_opts2); + + // read in both files swapping the columns + auto read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{{filepath1, filepath2}}) + .columns({"_col1", "_col0"}); + auto result = cudf::io::read_parquet(read_opts); + auto sliced = cudf::slice(result.tbl->view(), {0, num_rows, num_rows, 2 * num_rows}); + CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[0], swapped1); + CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[1], swapped2); +} + +TEST_F(ParquetReaderTest, FilterSimple) +{ + srand(31337); + auto written_table = create_random_fixed_table(9, 9, false); + + auto filepath = temp_env->get_temp_filepath("FilterSimple.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *written_table); + cudf::io::write_parquet(args); + + // 
Filtering AST - table[0] < RAND_MAX/2 + auto literal_value = cudf::numeric_scalar(RAND_MAX / 2); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_0 = cudf::ast::column_reference(0); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + auto predicate = cudf::compute_column(*written_table, filter_expression); + EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) + << "Predicate filter should return a boolean"; + auto expected = cudf::apply_boolean_mask(*written_table, *predicate); + // To make sure AST filters out some elements + EXPECT_LT(expected->num_rows(), written_table->num_rows()); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +auto create_parquet_with_stats(std::string const& filename) +{ + auto col0 = testdata::ascending(); + auto col1 = testdata::descending(); + auto col2 = testdata::unordered(); + + auto const expected = table_view{{col0, col1, col2}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_uint32"); + expected_metadata.column_metadata[1].set_name("col_int64"); + expected_metadata.column_metadata[2].set_name("col_double"); + + auto const filepath = temp_env->get_temp_filepath(filename); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)) + .row_group_size_rows(8000) + .stats_level(cudf::io::statistics_freq::STATISTICS_ROWGROUP); + cudf::io::write_parquet(out_opts); + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return std::pair{cudf::table{std::move(columns)}, 
filepath}; +} + +TEST_F(ParquetReaderTest, FilterIdentity) +{ + auto [src, filepath] = create_parquet_with_stats("FilterIdentity.parquet"); + + // Filtering AST - identity function, always true. + auto literal_value = cudf::numeric_scalar(true); + auto literal = cudf::ast::literal(literal_value); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression); + auto result = cudf::io::read_parquet(read_opts); + + cudf::io::parquet_reader_options read_opts2 = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result2 = cudf::io::read_parquet(read_opts2); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *result2.tbl); +} + +TEST_F(ParquetReaderTest, FilterReferenceExpression) +{ + auto [src, filepath] = create_parquet_with_stats("FilterReferenceExpression.parquet"); + // Filtering AST - table[0] < 150 + auto literal_value = cudf::numeric_scalar(150); + auto literal = cudf::ast::literal(literal_value); + auto col_ref_0 = cudf::ast::column_reference(0); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + // Expected result + auto predicate = cudf::compute_column(src, filter_expression); + auto expected = cudf::apply_boolean_mask(src, *predicate); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression); + auto result = cudf::io::read_parquet(read_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetReaderTest, FilterNamedExpression) +{ + auto [src, filepath] = create_parquet_with_stats("NamedExpression.parquet"); + // Filtering AST - table["col_uint32"] < 150 + auto literal_value = cudf::numeric_scalar(150); + auto literal = cudf::ast::literal(literal_value); + auto col_name_0 = 
cudf::ast::column_name_reference("col_uint32"); + auto parquet_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); + auto col_ref_0 = cudf::ast::column_reference(0); + auto table_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + // Expected result + auto predicate = cudf::compute_column(src, table_filter); + auto expected = cudf::apply_boolean_mask(src, *predicate); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(parquet_filter); + auto result = cudf::io::read_parquet(read_opts); + + // tests + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetReaderTest, FilterMultiple1) +{ + using T = cudf::string_view; + + auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple1.parquet"); + auto const written_table = src.view(); + + // Filtering AST - 10000 < table[0] < 12000 + std::string const low = "000010000"; + std::string const high = "000012000"; + auto lov = cudf::string_scalar(low, true); + auto hiv = cudf::string_scalar(high, true); + auto filter_col = cudf::ast::column_reference(0); + auto lo_lit = cudf::ast::literal(lov); + auto hi_lit = cudf::ast::literal(hiv); + auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit); + auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit); + auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); + + // Expected result + auto predicate = cudf::compute_column(written_table, expr_3); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + auto si = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_3); + auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + // tests + 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); +} + +TEST_F(ParquetReaderTest, FilterMultiple2) +{ + // multiple conditions on same column. + using T = cudf::string_view; + + auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple2.parquet"); + auto const written_table = src.view(); + // 0-8000, 8001-16000, 16001-20000 + + // Filtering AST + // (table[0] >= "000010000" AND table[0] < "000012000") OR + // (table[0] >= "000017000" AND table[0] < "000019000") + std::string const low1 = "000010000"; + std::string const high1 = "000012000"; + auto lov = cudf::string_scalar(low1, true); + auto hiv = cudf::string_scalar(high1, true); + auto filter_col = cudf::ast::column_reference(0); + auto lo_lit = cudf::ast::literal(lov); + auto hi_lit = cudf::ast::literal(hiv); + auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit); + auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit); + auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); + std::string const low2 = "000017000"; + std::string const high2 = "000019000"; + auto lov2 = cudf::string_scalar(low2, true); + auto hiv2 = cudf::string_scalar(high2, true); + auto lo_lit2 = cudf::ast::literal(lov2); + auto hi_lit2 = cudf::ast::literal(hiv2); + auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit2); + auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit2); + auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); + auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); + + // Expected result + auto predicate = cudf::compute_column(written_table, expr_7); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + auto si = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_7); + 
auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + // tests + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); +} + +TEST_F(ParquetReaderTest, FilterMultiple3) +{ + // multiple conditions with reference to multiple columns. + // index and name references mixed. + using T = uint32_t; + auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple3.parquet"); + auto const written_table = src.view(); + + // Filtering AST - (table[0] >= 70 AND table[0] < 90) OR (table[1] >= 100 AND table[1] < 120) + // row groups min, max: + // table[0] 0-80, 81-160, 161-200. + // table[1] 200-121, 120-41, 40-0. + auto filter_col1 = cudf::ast::column_reference(0); + auto filter_col2 = cudf::ast::column_name_reference("col1"); + T constexpr low1 = 70; + T constexpr high1 = 90; + T constexpr low2 = 100; + T constexpr high2 = 120; + auto lov = cudf::numeric_scalar(low1, true); + auto hiv = cudf::numeric_scalar(high1, true); + auto lo_lit1 = cudf::ast::literal(lov); + auto hi_lit1 = cudf::ast::literal(hiv); + auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col1, lo_lit1); + auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, hi_lit1); + auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); + auto lov2 = cudf::numeric_scalar(low2, true); + auto hiv2 = cudf::numeric_scalar(high2, true); + auto lo_lit2 = cudf::ast::literal(lov2); + auto hi_lit2 = cudf::ast::literal(hiv2); + auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2, lo_lit2); + auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, hi_lit2); + auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); + // expression to test + auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); + + // Expected result + auto filter_col2_ref = 
cudf::ast::column_reference(1); + auto expr_4_ref = + cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2_ref, lo_lit2); + auto expr_5_ref = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2_ref, hi_lit2); + auto expr_6_ref = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4_ref, expr_5_ref); + auto expr_7_ref = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6_ref); + auto predicate = cudf::compute_column(written_table, expr_7_ref); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + auto si = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_7); + auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + // tests + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); +} + +TEST_F(ParquetReaderTest, FilterSupported) +{ + using T = uint32_t; + auto const [src, filepath] = create_parquet_typed_with_stats("FilterSupported.parquet"); + auto const written_table = src.view(); + + // Filtering AST - ((table[0] > 70 AND table[0] <= 90) OR (table[1] >= 100 AND table[1] < 120)) + // AND (table[1] != 110) + // row groups min, max: + // table[0] 0-80, 81-160, 161-200. + // table[1] 200-121, 120-41, 40-0. 
+ auto filter_col1 = cudf::ast::column_reference(0); + auto filter_col2 = cudf::ast::column_reference(1); + T constexpr low1 = 70; + T constexpr high1 = 90; + T constexpr low2 = 100; + T constexpr high2 = 120; + T constexpr skip_value = 110; + auto lov = cudf::numeric_scalar(low1, true); + auto hiv = cudf::numeric_scalar(high1, true); + auto lo_lit1 = cudf::ast::literal(lov); + auto hi_lit1 = cudf::ast::literal(hiv); + auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER, filter_col1, lo_lit1); + auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, filter_col1, hi_lit1); + auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); + auto lov2 = cudf::numeric_scalar(low2, true); + auto hiv2 = cudf::numeric_scalar(high2, true); + auto lo_lit2 = cudf::ast::literal(lov2); + auto hi_lit2 = cudf::ast::literal(hiv2); + auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2, lo_lit2); + auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, hi_lit2); + auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); + auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); + auto skip_ov = cudf::numeric_scalar(skip_value, true); + auto skip_lit = cudf::ast::literal(skip_ov); + auto expr_8 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col2, skip_lit); + auto expr_9 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_7, expr_8); + + // Expected result + auto predicate = cudf::compute_column(written_table, expr_9); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + auto si = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_9); + auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + // tests + 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); +} + +TEST_F(ParquetReaderTest, FilterSupported2) +{ + using T = uint32_t; + constexpr auto num_rows = 4000; + auto elements0 = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 2000; }); + auto elements1 = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 1000; }); + auto elements2 = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 500; }); + auto col0 = cudf::test::fixed_width_column_wrapper(elements0, elements0 + num_rows); + auto col1 = cudf::test::fixed_width_column_wrapper(elements1, elements1 + num_rows); + auto col2 = cudf::test::fixed_width_column_wrapper(elements2, elements2 + num_rows); + auto const written_table = table_view{{col0, col1, col2}}; + auto const filepath = temp_env->get_temp_filepath("FilterSupported2.parquet"); + { + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) + .row_group_size_rows(1000); + cudf::io::write_parquet(out_opts); + } + auto si = cudf::io::source_info(filepath); + auto filter_col0 = cudf::ast::column_reference(0); + auto filter_col1 = cudf::ast::column_reference(1); + auto filter_col2 = cudf::ast::column_reference(2); + auto s_value = cudf::numeric_scalar(1, true); + auto lit_value = cudf::ast::literal(s_value); + + auto test_expr = [&](auto& expr) { + // Expected result + auto predicate = cudf::compute_column(written_table, expr); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + // tests + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); + auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); + }; + + // row groups min, max: + // table[0] 0-0, 0-0, 1-1, 1-1 + // table[1] 0-0, 1-1, 2-2, 3-3 + // table[2] 0-1, 2-3, 4-5, 6-7 + + // 
Filtering AST - table[i] == 1 + { + auto expr0 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col0, lit_value); + test_expr(expr0); + + auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col1, lit_value); + test_expr(expr1); + + auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col2, lit_value); + test_expr(expr2); + } + // Filtering AST - table[i] != 1 + { + auto expr0 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col0, lit_value); + test_expr(expr0); + + auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col1, lit_value); + test_expr(expr1); + + auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col2, lit_value); + test_expr(expr2); + } +} + +// Error types - type mismatch, invalid column name, invalid literal type, invalid operator, +// non-bool filter output type. +TEST_F(ParquetReaderTest, FilterErrors) +{ + using T = uint32_t; + auto const [src, filepath] = create_parquet_typed_with_stats("FilterErrors.parquet"); + auto const written_table = src.view(); + auto si = cudf::io::source_info(filepath); + + // Filtering AST - invalid column index + { + auto filter_col1 = cudf::ast::column_reference(3); + T constexpr low = 100; + auto lov = cudf::numeric_scalar(low, true); + auto low_lot = cudf::ast::literal(lov); + auto expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); + + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); + EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); + } + + // Filtering AST - invalid column name + { + auto filter_col1 = cudf::ast::column_name_reference("col3"); + T constexpr low = 100; + auto lov = cudf::numeric_scalar(low, true); + auto low_lot = cudf::ast::literal(lov); + auto expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); + 
EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); + } + + // Filtering AST - incompatible literal type + { + auto filter_col1 = cudf::ast::column_name_reference("col0"); + auto filter_col2 = cudf::ast::column_reference(1); + int64_t constexpr low = 100; + auto lov = cudf::numeric_scalar(low, true); + auto low_lot = cudf::ast::literal(lov); + auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); + auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, low_lot); + auto builder1 = cudf::io::parquet_reader_options::builder(si).filter(expr1); + EXPECT_THROW(cudf::io::read_parquet(builder1), cudf::logic_error); + + auto builder2 = cudf::io::parquet_reader_options::builder(si).filter(expr2); + EXPECT_THROW(cudf::io::read_parquet(builder2), cudf::logic_error); + } + + // Filtering AST - "table[0] + 110" is invalid filter expression + { + auto filter_col1 = cudf::ast::column_reference(0); + T constexpr add_value = 110; + auto add_v = cudf::numeric_scalar(add_value, true); + auto add_lit = cudf::ast::literal(add_v); + auto expr_8 = cudf::ast::operation(cudf::ast::ast_operator::ADD, filter_col1, add_lit); + + auto si = cudf::io::source_info(filepath); + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_8); + EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); + + // Expected result throw to show that the filter expression is invalid, + // not a limitation of the parquet predicate pushdown. 
+ auto predicate = cudf::compute_column(written_table, expr_8); + EXPECT_THROW(cudf::apply_boolean_mask(written_table, *predicate), cudf::logic_error); + } + + // Filtering AST - INT64(table[0] < 100) non-bool expression + { + auto filter_col1 = cudf::ast::column_reference(0); + T constexpr low = 100; + auto lov = cudf::numeric_scalar(low, true); + auto low_lot = cudf::ast::literal(lov); + auto bool_expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); + auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, bool_expr); + + auto builder = cudf::io::parquet_reader_options::builder(si).filter(cast); + EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); + EXPECT_NO_THROW(cudf::compute_column(written_table, cast)); + auto predicate = cudf::compute_column(written_table, cast); + EXPECT_NE(predicate->view().type().id(), cudf::type_id::BOOL8); + } +} + +// Filter without stats information in file. +TEST_F(ParquetReaderTest, FilterNoStats) +{ + using T = uint32_t; + constexpr auto num_rows = 16000; + auto elements = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 1000; }); + auto col0 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + auto const written_table = table_view{{col0}}; + auto const filepath = temp_env->get_temp_filepath("FilterNoStats.parquet"); + { + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) + .row_group_size_rows(8000) + .stats_level(cudf::io::statistics_freq::STATISTICS_NONE); + cudf::io::write_parquet(out_opts); + } + auto si = cudf::io::source_info(filepath); + auto filter_col0 = cudf::ast::column_reference(0); + auto s_value = cudf::numeric_scalar(1, true); + auto lit_value = cudf::ast::literal(s_value); + + // row groups min, max: + // table[0] 0-0, 1-1, 2-2, 3-3 + // Filtering AST - table[0] > 1 + auto expr = 
cudf::ast::operation(cudf::ast::ast_operator::GREATER, filter_col0, lit_value); + + // Expected result + auto predicate = cudf::compute_column(written_table, expr); + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + // tests + auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); + auto table_with_metadata = cudf::io::read_parquet(builder); + auto result = table_with_metadata.tbl->view(); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); +} + +// Filter for float column with NaN values +TEST_F(ParquetReaderTest, FilterFloatNAN) +{ + constexpr auto num_rows = 24000; + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [num_rows](auto i) { return i > num_rows / 2 ? NAN : i; }); + auto col0 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + auto col1 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); + + auto const written_table = table_view{{col0, col1}}; + auto const filepath = temp_env->get_temp_filepath("FilterFloatNAN.parquet"); + { + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) + .row_group_size_rows(8000); + cudf::io::write_parquet(out_opts); + } + auto si = cudf::io::source_info(filepath); + auto filter_col0 = cudf::ast::column_reference(0); + auto filter_col1 = cudf::ast::column_reference(1); + auto s0_value = cudf::numeric_scalar(NAN, true); + auto lit0_value = cudf::ast::literal(s0_value); + auto s1_value = cudf::numeric_scalar(NAN, true); + auto lit1_value = cudf::ast::literal(s1_value); + + // row groups min, max: + // table[0] 0-0, 1-1, 2-2, 3-3 + // Filtering AST - table[0] == NAN, table[1] != NAN + auto expr_eq = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col0, lit0_value); + auto expr_neq = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col1, lit1_value); + + // Expected result + auto predicate0 = 
cudf::compute_column(written_table, expr_eq); + auto expected0 = cudf::apply_boolean_mask(written_table, *predicate0); + auto predicate1 = cudf::compute_column(written_table, expr_neq); + auto expected1 = cudf::apply_boolean_mask(written_table, *predicate1); + + // tests + auto builder0 = cudf::io::parquet_reader_options::builder(si).filter(expr_eq); + auto table_with_metadata0 = cudf::io::read_parquet(builder0); + auto result0 = table_with_metadata0.tbl->view(); + auto builder1 = cudf::io::parquet_reader_options::builder(si).filter(expr_neq); + auto table_with_metadata1 = cudf::io::read_parquet(builder1); + auto result1 = table_with_metadata1.tbl->view(); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected0->view(), result0); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1->view(), result1); +} + +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, + 0x1c, 0x15, 0x04, 0x19, 
0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, + 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, + 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, + 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, + 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, + 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, + 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, + 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, + 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, + 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, + 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, + 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 
0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, + 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, + 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, + 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, + 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, + 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().column(0).size(), 6); + EXPECT_EQ(result.tbl->view().num_columns(), 2); + + column_wrapper col0{1, 2, 3, 4, 5, 6}; + column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; + cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); + + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, 
std::move(mask)); + + std::vector> struct_children; + struct_children.push_back(std::move(list_col)); + + auto outer_struct = + cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; + table_view expected{{col0, outer_struct}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); +} + +TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) +{ + constexpr int num_rows = 50'000; + constexpr auto seed = 21337; + + std::mt19937 engine{seed}; + auto int32_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); + auto int32_list = make_parquet_list_col(engine, num_rows, 5, false); + auto int64_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); + auto int64_list = make_parquet_list_col(engine, num_rows, 5, false); + auto int16_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); + auto int16_list = make_parquet_list_col(engine, num_rows, 5, false); + auto int8_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); + auto int8_list = make_parquet_list_col(engine, num_rows, 5, false); + + auto str_list_nulls = make_parquet_string_list_col(engine, num_rows, 5, 32, true); + auto str_list = make_parquet_string_list_col(engine, num_rows, 5, 32, false); + auto big_str_list_nulls = make_parquet_string_list_col(engine, num_rows, 5, 256, true); + auto big_str_list = make_parquet_string_list_col(engine, num_rows, 5, 256, false); + + auto int32_data = random_values(num_rows); + auto int64_data = random_values(num_rows); + auto int16_data = random_values(num_rows); + auto int8_data = random_values(num_rows); + auto str_data = string_values(engine, num_rows, 32); + auto big_str_data = string_values(engine, num_rows, 256); + + auto const validity = random_validity(engine); + auto const no_nulls = cudf::test::iterators::no_nulls(); + column_wrapper int32_nulls_col{int32_data.begin(), int32_data.end(), validity}; + column_wrapper int32_col{int32_data.begin(), int32_data.end(), no_nulls}; + column_wrapper 
int64_nulls_col{int64_data.begin(), int64_data.end(), validity}; + column_wrapper int64_col{int64_data.begin(), int64_data.end(), no_nulls}; + + auto str_col = cudf::test::strings_column_wrapper(str_data.begin(), str_data.end(), no_nulls); + auto str_col_nulls = cudf::purge_nonempty_nulls( + cudf::test::strings_column_wrapper(str_data.begin(), str_data.end(), validity)); + auto big_str_col = + cudf::test::strings_column_wrapper(big_str_data.begin(), big_str_data.end(), no_nulls); + auto big_str_col_nulls = cudf::purge_nonempty_nulls( + cudf::test::strings_column_wrapper(big_str_data.begin(), big_str_data.end(), validity)); + + cudf::table_view tbl({int32_col, int32_nulls_col, *int32_list, *int32_list_nulls, + int64_col, int64_nulls_col, *int64_list, *int64_list_nulls, + *int16_list, *int16_list_nulls, *int8_list, *int8_list_nulls, + str_col, *str_col_nulls, *str_list, *str_list_nulls, + big_str_col, *big_str_col_nulls, *big_str_list, *big_str_list_nulls}); + + auto const filepath = temp_env->get_temp_filepath("DeltaSkipRowsWithNulls.parquet"); + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .max_page_size_rows(20'000) + .write_v2_headers(true) + .build(); + cudf::io::write_parquet(out_opts); + + // skip_rows / num_rows + // clang-format off + std::vector> params{ + // skip and then read rest of file + {-1, -1}, {1, -1}, {2, -1}, {32, -1}, {33, -1}, {128, -1}, {1000, -1}, + // no skip but read fewer rows + {0, 1}, {0, 2}, {0, 31}, {0, 32}, {0, 33}, {0, 128}, {0, 129}, {0, 130}, + // skip and truncate + {1, 32}, {1, 33}, {32, 32}, {33, 139}, + // cross page boundaries + {10'000, 20'000} + }; + + // clang-format on + for (auto p : params) { + cudf::io::parquet_reader_options read_args = + 
cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + if (p.first >= 0) { read_args.set_skip_rows(p.first); } + if (p.second >= 0) { read_args.set_num_rows(p.second); } + auto result = cudf::io::read_parquet(read_args); + + p.first = p.first < 0 ? 0 : p.first; + p.second = p.second < 0 ? num_rows - p.first : p.second; + std::vector slice_indices{p.first, p.first + p.second}; + std::vector expected = cudf::slice(tbl, slice_indices); + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected[0]); + + // test writing the result back out as a further check of the delta writer's correctness + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts2 = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, + result.tbl->view()) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .max_page_size_rows(20'000) + .write_v2_headers(true); + cudf::io::write_parquet(out_opts2); + + cudf::io::parquet_reader_options default_in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + auto const result2 = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), result2.tbl->view()); + } +} + +/////////////////// +// metadata tests + +// Test fixture for metadata tests +struct ParquetMetadataReaderTest : public cudf::test::BaseFixture { + std::string print(cudf::io::parquet_column_schema schema, int depth = 0) + { + std::string child_str; + for (auto const& child : schema.children()) { + child_str += print(child, depth + 1); + } + return std::string(depth, ' ') + schema.name() + "\n" + child_str; + } +}; + +TEST_F(ParquetMetadataReaderTest, TestBasic) +{ + auto const num_rows = 1200; + + auto ints = random_values(num_rows); + auto floats = random_values(num_rows); + column_wrapper int_col(ints.begin(), 
ints.end()); + column_wrapper float_col(floats.begin(), floats.end()); + + table_view expected({int_col, float_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("int_col"); + expected_metadata.column_metadata[1].set_name("float_col"); + + auto filepath = temp_env->get_temp_filepath("MetadataTest.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_parquet(out_opts); + + auto meta = read_parquet_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(meta.num_rows(), num_rows); + + std::string expected_schema = R"(schema + int_col + float_col +)"; + EXPECT_EQ(expected_schema, print(meta.schema().root())); + + EXPECT_EQ(meta.schema().root().name(), "schema"); + EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); + ASSERT_EQ(meta.schema().root().num_children(), 2); + + EXPECT_EQ(meta.schema().root().child(0).name(), "int_col"); + EXPECT_EQ(meta.schema().root().child(1).name(), "float_col"); +} + +TEST_F(ParquetMetadataReaderTest, TestNested) +{ + auto const num_rows = 1200; + auto const lists_per_row = 4; + auto const num_child_rows = num_rows * lists_per_row; + + auto keys = random_values(num_child_rows); + auto vals = random_values(num_child_rows); + column_wrapper keys_col(keys.begin(), keys.end()); + column_wrapper vals_col(vals.begin(), vals.end()); + auto s_col = cudf::test::structs_column_wrapper({keys_col, vals_col}).release(); + + std::vector row_offsets(num_rows + 1); + for (int idx = 0; idx < num_rows + 1; ++idx) { + row_offsets[idx] = idx * lists_per_row; + } + column_wrapper offsets(row_offsets.begin(), row_offsets.end()); + + auto list_col = + cudf::make_lists_column(num_rows, offsets.release(), std::move(s_col), 0, rmm::device_buffer{}); + + table_view expected({*list_col, *list_col}); + + 
cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("maps"); + expected_metadata.column_metadata[0].set_list_column_as_map(); + expected_metadata.column_metadata[1].set_name("lists"); + expected_metadata.column_metadata[1].child(1).child(0).set_name("int_field"); + expected_metadata.column_metadata[1].child(1).child(1).set_name("float_field"); + + auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_parquet(out_opts); + + auto meta = read_parquet_metadata(cudf::io::source_info{filepath}); + EXPECT_EQ(meta.num_rows(), num_rows); + + std::string expected_schema = R"(schema + maps + key_value + key + value + lists + list + element + int_field + float_field +)"; + EXPECT_EQ(expected_schema, print(meta.schema().root())); + + EXPECT_EQ(meta.schema().root().name(), "schema"); + EXPECT_EQ(meta.schema().root().type_kind(), + cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct + ASSERT_EQ(meta.schema().root().num_children(), 2); + + auto const& out_map_col = meta.schema().root().child(0); + EXPECT_EQ(out_map_col.name(), "maps"); + EXPECT_EQ(out_map_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // map + + ASSERT_EQ(out_map_col.num_children(), 1); + EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) + ASSERT_EQ(out_map_col.child(0).num_children(), 2); + EXPECT_EQ(out_map_col.child(0).child(0).name(), "key"); // key (named in parquet writer) + EXPECT_EQ(out_map_col.child(0).child(1).name(), "value"); // value (named in parquet writer) + EXPECT_EQ(out_map_col.child(0).child(0).type_kind(), cudf::io::parquet::TypeKind::INT32); // int + EXPECT_EQ(out_map_col.child(0).child(1).type_kind(), + cudf::io::parquet::TypeKind::FLOAT); // float + + auto const& out_list_col 
= meta.schema().root().child(1); + EXPECT_EQ(out_list_col.name(), "lists"); + EXPECT_EQ(out_list_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // list + // TODO repetition type? + ASSERT_EQ(out_list_col.num_children(), 1); + EXPECT_EQ(out_list_col.child(0).name(), "list"); // list (named in parquet writer) + ASSERT_EQ(out_list_col.child(0).num_children(), 1); + + auto const& out_list_struct_col = out_list_col.child(0).child(0); + EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in parquet writer) + EXPECT_EQ(out_list_struct_col.type_kind(), + cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct + ASSERT_EQ(out_list_struct_col.num_children(), 2); + + auto const& out_int_col = out_list_struct_col.child(0); + EXPECT_EQ(out_int_col.name(), "int_field"); + EXPECT_EQ(out_int_col.type_kind(), cudf::io::parquet::TypeKind::INT32); + + auto const& out_float_col = out_list_struct_col.child(1); + EXPECT_EQ(out_float_col.name(), "float_field"); + EXPECT_EQ(out_float_col.type_kind(), cudf::io::parquet::TypeKind::FLOAT); +} + +/////////////////////// +// reader source tests + +template +struct ParquetReaderSourceTest : public ParquetReaderTest {}; + +TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes); + +TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info( + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()))); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } + + { + cudf::io::parquet_reader_options in_opts = + 
cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span( + reinterpret_cast(out_buffer.data()), out_buffer.size()))); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } +} + +TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + auto full_table = cudf::concatenate(std::vector({*table, *table})); + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } +} + +////////////////////////////// +// predicate pushdown tests + +// Test for Types - numeric, chrono, string. 
+template +struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest {}; + +TYPED_TEST_SUITE(ParquetReaderPredicatePushdownTest, SupportedTestTypes); + +TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) +{ + using T = TypeParam; + + auto const [src, filepath] = create_parquet_typed_with_stats("FilterTyped.parquet"); + auto const written_table = src.view(); + + // Filtering AST + auto literal_value = []() { + if constexpr (cudf::is_timestamp()) { + // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds + return cudf::timestamp_scalar(T(typename T::duration(10000))); // i (0-20,000) + } else if constexpr (cudf::is_duration()) { + // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds + return cudf::duration_scalar(T(10000)); // i (0-20,000) + } else if constexpr (std::is_same_v) { + // table[0] < "000010000" + return cudf::string_scalar("000010000"); // i (0-20,000) + } else { + // table[0] < 0 or 100u + return cudf::numeric_scalar((100 - 100 * std::is_signed_v)); // i/100 (-100-100/ 0-200) + } + }(); + auto literal = cudf::ast::literal(literal_value); + auto col_name_0 = cudf::ast::column_name_reference("col0"); + auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); + auto col_ref_0 = cudf::ast::column_reference(0); + auto ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); + + // Expected result + auto predicate = cudf::compute_column(written_table, ref_filter); + EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) + << "Predicate filter should return a boolean"; + auto expected = cudf::apply_boolean_mask(written_table, *predicate); + + // Reading with Predicate Pushdown + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression); + auto result = cudf::io::read_parquet(read_opts); + auto result_table = result.tbl->view(); + 
+ // tests + EXPECT_EQ(int(written_table.column(0).type().id()), int(result_table.column(0).type().id())) + << "col0 type mismatch"; + // To make sure AST filters out some elements + EXPECT_LT(expected->num_rows(), written_table.num_rows()); + EXPECT_EQ(result_table.num_rows(), expected->num_rows()); + EXPECT_EQ(result_table.num_columns(), expected->num_columns()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); +} diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 785a398d716..be2ecd56424 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7290 +15,12 @@ */ #include -#include -#include -#include -#include -#include -#include -#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include - -#include -#include -#include - -template -using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; -using column = cudf::column; -using table = cudf::table; -using table_view = cudf::table_view; - -// Global environment for temporary files -auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - -template -std::unique_ptr create_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows, - bool include_validity, - Elements elements) -{ - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector> src_cols(num_columns); - for (int idx = 0; idx < num_columns; idx++) { - if 
(include_validity) { - src_cols[idx] = - cudf::test::fixed_width_column_wrapper(elements, elements + num_rows, valids); - } else { - src_cols[idx] = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - } - } - std::vector> columns(num_columns); - std::transform(src_cols.begin(), - src_cols.end(), - columns.begin(), - [](cudf::test::fixed_width_column_wrapper& in) { - auto ret = in.release(); - // pre-cache the null count - [[maybe_unused]] auto const nulls = ret->has_nulls(); - return ret; - }); - return std::make_unique(std::move(columns)); -} - -template -std::unique_ptr create_random_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows, - bool include_validity) -{ - auto rand_elements = - cudf::detail::make_counting_transform_iterator(0, [](T i) { return rand(); }); - return create_fixed_table(num_columns, num_rows, include_validity, rand_elements); -} - -template -std::unique_ptr create_compressible_fixed_table(cudf::size_type num_columns, - cudf::size_type num_rows, - cudf::size_type period, - bool include_validity) -{ - auto compressible_elements = - cudf::detail::make_counting_transform_iterator(0, [period](T i) { return i / period; }); - return create_fixed_table(num_columns, num_rows, include_validity, compressible_elements); -} - -// this function replicates the "list_gen" function in -// python/cudf/cudf/tests/test_parquet.py -template -std::unique_ptr make_parquet_list_list_col( - int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity) -{ - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0 ? 
1 : 0; }); - - // root list - std::vector row_offsets(num_rows + 1); - int row_offset_count = 0; - { - int offset = 0; - for (int idx = 0; idx < (num_rows) + 1; idx++) { - row_offsets[row_offset_count] = offset; - if (!include_validity || valids[idx]) { offset += lists_per_row; } - row_offset_count++; - } - } - cudf::test::fixed_width_column_wrapper offsets(row_offsets.begin(), - row_offsets.begin() + row_offset_count); - - // child list - std::vector child_row_offsets((num_rows * lists_per_row) + 1); - int child_row_offset_count = 0; - { - int offset = 0; - for (int idx = 0; idx < (num_rows * lists_per_row); idx++) { - int row_index = idx / lists_per_row; - if (include_validity && !valids[row_index]) { continue; } - - child_row_offsets[child_row_offset_count] = offset; - offset += list_size; - child_row_offset_count++; - } - child_row_offsets[child_row_offset_count++] = offset; - } - cudf::test::fixed_width_column_wrapper child_offsets( - child_row_offsets.begin(), child_row_offsets.begin() + child_row_offset_count); - - // child values - std::vector child_values(num_rows * lists_per_row * list_size); - T first_child_value_index = skip_rows * lists_per_row * list_size; - int child_value_count = 0; - { - for (int idx = 0; idx < (num_rows * lists_per_row * list_size); idx++) { - int row_index = idx / (lists_per_row * list_size); - - int val = first_child_value_index; - first_child_value_index++; - - if (include_validity && !valids[row_index]) { continue; } - - child_values[child_value_count] = val; - child_value_count++; - } - } - // validity by value instead of index - auto valids2 = cudf::detail::make_counting_transform_iterator( - 0, [list_size](auto i) { return (i % list_size) % 2 == 0 ? 1 : 0; }); - auto child_data = include_validity - ? 
cudf::test::fixed_width_column_wrapper( - child_values.begin(), child_values.begin() + child_value_count, valids2) - : cudf::test::fixed_width_column_wrapper( - child_values.begin(), child_values.begin() + child_value_count); - - int child_offsets_size = static_cast(child_offsets).size() - 1; - auto child = cudf::make_lists_column( - child_offsets_size, child_offsets.release(), child_data.release(), 0, rmm::device_buffer{}); - - int offsets_size = static_cast(offsets).size() - 1; - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + offsets_size); - return include_validity - ? cudf::make_lists_column( - offsets_size, offsets.release(), std::move(child), null_count, std::move(null_mask)) - : cudf::make_lists_column( - offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); -} - -// given a datasource pointing to a parquet file, read the footer -// of the file to populate the FileMetaData pointed to by file_meta_data. -// throws cudf::logic_error if the file or metadata is invalid. 
-void read_footer(std::unique_ptr const& source, - cudf::io::parquet::detail::FileMetaData* file_meta_data) -{ - constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); - - auto const len = source->size(); - auto const header_buffer = source->host_read(0, header_len); - auto const header = - reinterpret_cast(header_buffer->data()); - auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = - reinterpret_cast(ender_buffer->data()); - - // checks for valid header, footer, and file length - ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && - ender->magic == cudf::io::parquet::detail::parquet_magic); - ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); - - // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" - // seek backwards from the end of the file (footer_length + 8 bytes of ender) - auto const footer_buffer = - source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); - - cp.read(file_meta_data); -} - -// returns the number of bits used for dictionary encoding data at the given page location. -// this assumes the data is uncompressed. -// throws cudf::logic_error if the page_loc data is invalid. 
-int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::detail::PageLocation const& page_loc) -{ - CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); - CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - - cudf::io::parquet::detail::PageHeader page_hdr; - auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - cp.read(&page_hdr); - - // cp should be pointing at the start of page data now. the first byte - // should be the encoding bit size - return cp.getb(); -} - -// read column index from datasource at location indicated by chunk, -// parse and return as a ColumnIndex struct. -// throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::detail::ColumnIndex read_column_index( - std::unique_ptr const& source, - cudf::io::parquet::detail::ColumnChunk const& chunk) -{ - CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); - CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - - cudf::io::parquet::detail::ColumnIndex colidx; - auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); - cp.read(&colidx); - return colidx; -} - -// read offset index from datasource at location indicated by chunk, -// parse and return as an OffsetIndex struct. -// throws cudf::logic_error if the chunk data is invalid. 
-cudf::io::parquet::detail::OffsetIndex read_offset_index( - std::unique_ptr const& source, - cudf::io::parquet::detail::ColumnChunk const& chunk) -{ - CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); - CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - - cudf::io::parquet::detail::OffsetIndex offidx; - auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); - cp.read(&offidx); - return offidx; -} - -// Return as a Statistics from the column chunk -cudf::io::parquet::detail::Statistics const& get_statistics( - cudf::io::parquet::detail::ColumnChunk const& chunk) -{ - return chunk.meta_data.statistics; -} - -// read page header from datasource at location indicated by page_loc, -// parse and return as a PageHeader struct. -// throws cudf::logic_error if the page_loc data is invalid. -cudf::io::parquet::detail::PageHeader read_page_header( - std::unique_ptr const& source, - cudf::io::parquet::detail::PageLocation const& page_loc) -{ - CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); - CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - - cudf::io::parquet::detail::PageHeader page_hdr; - auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); - cp.read(&page_hdr); - return page_hdr; -} - -// Base test fixture for tests -struct ParquetWriterTest : public cudf::test::BaseFixture {}; - -// Base test fixture for tests -struct ParquetReaderTest : public cudf::test::BaseFixture {}; - -// Base test fixture for "stress" tests -struct ParquetWriterStressTest : public cudf::test::BaseFixture {}; - -// Typed test fixture for numeric type tests -template -struct ParquetWriterNumericTypeTest : public ParquetWriterTest { - auto type() { return 
cudf::data_type{cudf::type_to_id()}; } -}; - -// Typed test fixture for comparable type tests -template -struct ParquetWriterComparableTypeTest : public ParquetWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Typed test fixture for timestamp type tests -template -struct ParquetWriterChronoTypeTest : public ParquetWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Typed test fixture for timestamp type tests -template -struct ParquetWriterTimestampTypeTest : public ParquetWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Typed test fixture for all types -template -struct ParquetWriterSchemaTest : public ParquetWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -template -struct ParquetReaderSourceTest : public ParquetReaderTest {}; - -template -struct ParquetWriterDeltaTest : public ParquetWriterTest {}; - -// Declare typed test cases -// TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5352 -using SupportedTypes = cudf::test::Types; -TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes); -using ComparableAndFixedTypes = - cudf::test::Concat; -TYPED_TEST_SUITE(ParquetWriterComparableTypeTest, ComparableAndFixedTypes); -TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, cudf::test::ChronoTypes); -using SupportedTimestampTypes = - cudf::test::Types; -TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); -TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); -using ByteLikeTypes = cudf::test::Types; -TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes); - -// Base test fixture for chunked writer tests -struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {}; - -// Typed test fixture for numeric type tests -template -struct ParquetChunkedWriterNumericTypeTest : public ParquetChunkedWriterTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } -}; - -// Declare typed test cases -TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); - -// Base test fixture for size-parameterized tests -class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; - -// test the allowed bit widths for dictionary encoding -INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, - ParquetSizedTest, - testing::Range(1, 25), - testing::PrintToStringParamName()); - -// Base test fixture for V2 header tests -class ParquetV2Test : public ::cudf::test::BaseFixtureWithParam {}; -INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, - ParquetV2Test, - testing::Bool(), - testing::PrintToStringParamName()); - -namespace { -// Generates a vector of uniform random values of type T -template -inline auto random_values(size_t size) -{ - std::vector values(size); - - using T1 = T; - using uniform_distribution = - typename std::conditional_t, - std::bernoulli_distribution, - std::conditional_t, - std::uniform_real_distribution, - std::uniform_int_distribution>>; - - static constexpr auto seed = 0xf00d; - 
static std::mt19937 engine{seed}; - static uniform_distribution dist{}; - std::generate_n(values.begin(), size, [&]() { return T{dist(engine)}; }); - - return values; -} - -} // namespace - -TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) -{ - auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - constexpr auto num_rows = 800; - column_wrapper col(sequence, sequence + num_rows, validity); - - auto expected = table_view{{col}}; - - auto filepath = temp_env->get_temp_filepath("SingleColumn.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumnWithNulls) -{ - auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); - - constexpr auto num_rows = 100; - column_wrapper col(sequence, sequence + num_rows, validity); - - auto expected = table_view{{col}}; - - auto filepath = temp_env->get_temp_filepath("SingleColumnWithNulls.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - 
-template -void test_durations(mask_op_t mask_op) -{ - std::default_random_engine generator; - std::uniform_int_distribution distribution_d(0, 30); - auto sequence_d = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return distribution_d(generator); }); - - std::uniform_int_distribution distribution_s(0, 86400); - auto sequence_s = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return distribution_s(generator); }); - - std::uniform_int_distribution distribution(0, 86400 * 1000); - auto sequence = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return distribution(generator); }); - - auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op); - - constexpr auto num_rows = 100; - // Durations longer than a day are not exactly valid, but cudf should be able to round trip - auto durations_d = cudf::test::fixed_width_column_wrapper( - sequence_d, sequence_d + num_rows, mask); - auto durations_s = cudf::test::fixed_width_column_wrapper( - sequence_s, sequence_s + num_rows, mask); - auto durations_ms = cudf::test::fixed_width_column_wrapper( - sequence, sequence + num_rows, mask); - auto durations_us = cudf::test::fixed_width_column_wrapper( - sequence, sequence + num_rows, mask); - auto durations_ns = cudf::test::fixed_width_column_wrapper( - sequence, sequence + num_rows, mask); - - auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}}; - - auto filepath = temp_env->get_temp_filepath("Durations.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - auto durations_d_got = - cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4)); -} - -TEST_F(ParquetWriterTest, Durations) -{ - test_durations([](auto i) { return true; }); - test_durations([](auto i) { return (i % 2) != 0; }); - test_durations([](auto i) { return (i % 3) != 0; }); - test_durations([](auto i) { return false; }); -} - -TYPED_TEST(ParquetWriterTimestampTypeTest, Timestamps) -{ - auto sequence = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - constexpr auto num_rows = 100; - column_wrapper col( - sequence, sequence + num_rows, validity); - - auto expected = table_view{{col}}; - - auto filepath = temp_env->get_temp_filepath("Timestamps.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .timestamp_type(this->type()); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampsWithNulls) -{ - auto sequence = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); - auto validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 30) && (i < 
60); }); - - constexpr auto num_rows = 100; - column_wrapper col( - sequence, sequence + num_rows, validity); - - auto expected = table_view{{col}}; - - auto filepath = temp_env->get_temp_filepath("TimestampsWithNulls.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .timestamp_type(this->type()); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) -{ - constexpr int64_t max = std::numeric_limits::max(); - auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - constexpr auto num_rows = 100; - column_wrapper col( - sequence, sequence + num_rows, validity); - table_view expected({col}); - - auto filepath = temp_env->get_temp_filepath("ParquetTimestampOverflow.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .timestamp_type(this->type()); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_P(ParquetV2Test, MultiColumn) -{ - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - - // auto col0_data = random_values(num_rows); - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto 
col5_data = random_values(num_rows); - auto col6_vals = random_values(num_rows); - auto col7_vals = random_values(num_rows); - auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; - }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [col8_vals](auto i) { - return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; - }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - // column_wrapper col0{ - // col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; - column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; - column_wrapper col6{col6_data, col6_data + num_rows, validity}; - column_wrapper col7{col7_data, col7_data + num_rows, validity}; - column_wrapper col8{col8_data, col8_data + num_rows, validity}; - - auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; - - cudf::io::table_input_metadata expected_metadata(expected); - // expected_metadata.column_metadata[0].set_name( "bools"); - expected_metadata.column_metadata[0].set_name("int8s"); - expected_metadata.column_metadata[1].set_name("int16s"); - expected_metadata.column_metadata[2].set_name("int32s"); - expected_metadata.column_metadata[3].set_name("floats"); - expected_metadata.column_metadata[4].set_name("doubles"); - expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(10); - 
expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20); - expected_metadata.column_metadata[7].set_name("decimal128s").set_decimal_precision(40); - - auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, MultiColumnWithNulls) -{ - constexpr auto num_rows = 100; - auto const is_v2 = GetParam(); - - // auto col0_data = random_values(num_rows); - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto col5_data = random_values(num_rows); - auto col6_vals = random_values(num_rows); - auto col7_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; - }); - // auto col0_mask = cudf::detail::make_counting_transform_iterator( - // 0, [](auto i) { return (i % 2); }); - auto col1_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); - auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - auto col3_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 
(i == (num_rows - 1)); }); - auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); - auto col5_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); - auto col6_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 5); }); - auto col7_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i != 55); }); - - // column_wrapper col0{ - // col0_data.begin(), col0_data.end(), col0_mask}; - column_wrapper col1{col1_data.begin(), col1_data.end(), col1_mask}; - column_wrapper col2{col2_data.begin(), col2_data.end(), col2_mask}; - column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; - column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; - column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; - column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; - - auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}}; - - cudf::io::table_input_metadata expected_metadata(expected); - // expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_metadata[0].set_name("int8s"); - expected_metadata.column_metadata[1].set_name("int16s"); - expected_metadata.column_metadata[2].set_name("int32s"); - expected_metadata.column_metadata[3].set_name("floats"); - expected_metadata.column_metadata[4].set_name("doubles"); - expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(9); - expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20); - - auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - - 
cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - // TODO: Need to be able to return metadata in tree form from reader so they can be compared. - // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not - // have any tests for it c++ or python. - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, Strings) -{ - auto const is_v2 = GetParam(); - - std::vector strings{ - "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - auto const num_rows = strings.size(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; - column_wrapper col1{strings.begin(), strings.end()}; - column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; - - auto expected = table_view{{col0, col1, col2}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - expected_metadata.column_metadata[1].set_name("col_string"); - expected_metadata.column_metadata[2].set_name("col_another"); - - auto filepath = temp_env->get_temp_filepath("Strings.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, StringsAsBinary) -{ - auto const is_v2 = GetParam(); - std::vector unicode_strings{ - "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - std::vector ascii_strings{ - "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - - column_wrapper col0{ascii_strings.begin(), ascii_strings.end()}; - column_wrapper col1{unicode_strings.begin(), unicode_strings.end()}; - column_wrapper col2{ascii_strings.begin(), ascii_strings.end()}; - cudf::test::lists_column_wrapper col3{{'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'M', 'o', 'n', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'u', 'n', 'd', 'a', 'y'}}; - cudf::test::lists_column_wrapper col4{ - {'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 200, 133, 'd', 'n', 200, 133, 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'M', 'o', 'n', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'u', 'n', 'd', 'a', 'y'}}; - - auto write_tbl = table_view{{col0, col1, col2, col3, col4}}; - - cudf::io::table_input_metadata expected_metadata(write_tbl); - expected_metadata.column_metadata[0].set_name("col_single").set_output_as_binary(true); - expected_metadata.column_metadata[1].set_name("col_string").set_output_as_binary(true); - expected_metadata.column_metadata[2].set_name("col_another").set_output_as_binary(true); - expected_metadata.column_metadata[3].set_name("col_binary"); - expected_metadata.column_metadata[4].set_name("col_binary2"); - - auto filepath = temp_env->get_temp_filepath("BinaryStrings.parquet"); - cudf::io::parquet_writer_options out_opts = - 
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, write_tbl) - .write_v2_headers(is_v2) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .metadata(expected_metadata); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema( - {cudf::io::reader_column_schema().set_convert_binary_to_strings(false), - cudf::io::reader_column_schema().set_convert_binary_to_strings(false), - cudf::io::reader_column_schema().set_convert_binary_to_strings(false), - cudf::io::reader_column_schema().add_child(cudf::io::reader_column_schema()), - cudf::io::reader_column_schema().add_child(cudf::io::reader_column_schema())}); - auto result = cudf::io::read_parquet(in_opts); - auto expected = table_view{{col3, col4, col3, col3, col4}}; - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, SlicedTable) -{ - // This test checks for writing zero copy, offsetted views into existing cudf tables - - std::vector strings{ - "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - auto const num_rows = strings.size(); - auto const is_v2 = GetParam(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - auto validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); - - column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; - column_wrapper col1{strings.begin(), strings.end()}; - column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; - - using lcw = cudf::test::lists_column_wrapper; - lcw col3{{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; - - // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] - // [NULL, [[13],[14,15,16]], NULL] - // [NULL, [], NULL, [[]]] - // NULL - // 
[[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] - // [NULL, [[13],[14,15,16]], NULL] - // [[[]]] - // [NULL, [], NULL, [[]]] - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - lcw col4{{ - {{{{1, 2, 3, 4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, - {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, - {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, - lcw{lcw{lcw{}}}, - {{{{1, 2, 3, 4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, - {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, - lcw{lcw{lcw{}}}, - {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, - }, - valids2}; - - // Struct column - auto ages_col = cudf::test::fixed_width_column_wrapper{ - {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}}; - - auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}}; - - // Struct/List mixed column - - // [] - // [NULL, 2, NULL] - // [4, 5] - // NULL - // [] - // [7, 8, 9] - // [10] - // [11, 12] - lcw land{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}, {10}, {11, 12}}, valids2}; - - // [] - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8], []] - // [[]] - // [[]] - // [[], [], []] - // [[10]] - // [[13, 14], [15]] - lcw flats{lcw{}, - {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, - {{7, 8}, {}}, - lcw{lcw{}}, - lcw{lcw{}}, - lcw{lcw{}, lcw{}, lcw{}}, - {lcw{10}}, - {{13, 14}, {15}}}; - - auto struct_1 = cudf::test::structs_column_wrapper{land, flats}; - auto is_human = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, true, false, true, false}}; - auto col6 = cudf::test::structs_column_wrapper{{is_human, struct_1}}; - - auto expected = table_view({col0, col1, col2, col3, col4, col5, col6}); - - // auto expected_slice = expected; - auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows) - 1}); - - cudf::io::table_input_metadata 
expected_metadata(expected_slice); - expected_metadata.column_metadata[0].set_name("col_other"); - expected_metadata.column_metadata[1].set_name("col_string"); - expected_metadata.column_metadata[2].set_name("col_another"); - expected_metadata.column_metadata[3].set_name("col_list"); - expected_metadata.column_metadata[4].set_name("col_multi_level_list"); - expected_metadata.column_metadata[5].set_name("col_struct"); - expected_metadata.column_metadata[5].set_name("col_struct_list"); - expected_metadata.column_metadata[6].child(0).set_name("human?"); - expected_metadata.column_metadata[6].child(1).set_name("particulars"); - expected_metadata.column_metadata[6].child(1).child(0).set_name("land"); - expected_metadata.column_metadata[6].child(1).child(1).set_name("flats"); - - auto filepath = temp_env->get_temp_filepath("SlicedTable.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, ListColumn) -{ - auto const is_v2 = GetParam(); - - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // [NULL, 2, NULL] - // [] - // [4, 5] - // NULL - lcw col0{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; - - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8]] - // [] - // [[]] - lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, 
lcw{lcw{}}}; - - // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] - // [[7, 8]] - // [] - // [[]] - lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, valids2}, {{7, 8}}, lcw{}, lcw{lcw{}}}; - - // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] - // [[7, 8]] - // [] - // [[]] - using dlcw = cudf::test::lists_column_wrapper; - dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, valids}}, valids2}, - {{7., 8.}}, - dlcw{}, - dlcw{dlcw{}}}; - - // TODO: uint16_t lists are not read properly in parquet reader - // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] - // [[7, 8]] - // [] - // NULL - // using ui16lcw = cudf::test::lists_column_wrapper; - // cudf::test::lists_column_wrapper col4{ - // {{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, valids2}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, - // valids2}; - - // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] - // [[7, 8]] - // [] - // NULL - lcw col5{ - {{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, valids}}, valids2}, {{7, 8}}, lcw{}, lcw{lcw{}}}, - valids2}; - - using strlcw = cudf::test::lists_column_wrapper; - cudf::test::lists_column_wrapper col6{ - {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, - {{"bee", "sting"}}, - strlcw{}, - strlcw{strlcw{}}}; - - // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] - // [NULL, [[13],[14,15,16]], NULL] - // [NULL, [], NULL, [[]]] - // NULL - lcw col7{{ - {{{{1, 2, 3, 4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, - {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, - {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, - lcw{lcw{lcw{}}}, - }, - valids2}; - - table_view expected({col0, col1, col2, col3, /* col4, */ col5, col6, col7}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_list_int_0"); - expected_metadata.column_metadata[1].set_name("col_list_list_int_1"); - expected_metadata.column_metadata[2].set_name("col_list_list_int_nullable_2"); - 
expected_metadata.column_metadata[3].set_name("col_list_list_nullable_double_nullable_3"); - // expected_metadata.column_metadata[0].set_name("col_list_list_uint16_4"); - expected_metadata.column_metadata[4].set_name("col_list_nullable_list_nullable_int_nullable_5"); - expected_metadata.column_metadata[5].set_name("col_list_list_string_6"); - expected_metadata.column_metadata[6].set_name("col_list_list_list_7"); - - auto filepath = temp_env->get_temp_filepath("ListColumn.parquet"); - auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata) - .compression(cudf::io::compression_type::NONE); - - cudf::io::write_parquet(out_opts); - - auto in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(ParquetWriterTest, MultiIndex) -{ - constexpr auto num_rows = 100; - - auto col0_data = random_values(num_rows); - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - column_wrapper col0{col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; - - auto expected = table_view{{col0, col1, col2, col3, col4}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("int8s"); - 
expected_metadata.column_metadata[1].set_name("int16s"); - expected_metadata.column_metadata[2].set_name("int32s"); - expected_metadata.column_metadata[3].set_name("floats"); - expected_metadata.column_metadata[4].set_name("doubles"); - - auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata) - .key_value_metadata( - {{{"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}}}); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .use_pandas_metadata(true) - .columns({"int32s", "floats", "doubles"}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(ParquetWriterTest, BufferSource) -{ - constexpr auto num_rows = 100 << 10; - auto const seq_col = random_values(num_rows); - auto const validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{seq_col.begin(), seq_col.end(), validity}; - - auto const expected = table_view{{col}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_other"); - - std::vector out_buffer; - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), expected) - .metadata(expected_metadata); - cudf::io::write_parquet(out_opts); - - // host buffer - { - cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info(out_buffer.data(), out_buffer.size())); - auto const result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, 
result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - } - - // device buffer - { - auto const d_input = cudf::detail::make_device_uvector_sync( - cudf::host_span{reinterpret_cast(out_buffer.data()), - out_buffer.size()}, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - auto const d_buffer = cudf::device_span( - reinterpret_cast(d_input.data()), d_input.size()); - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(d_buffer)); - auto const result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - } -} - -TEST_F(ParquetWriterTest, ManyFragments) -{ - srand(31337); - auto const expected = create_random_fixed_table(10, 6'000'000, false); - - auto const filepath = temp_env->get_temp_filepath("ManyFragments.parquet"); - cudf::io::parquet_writer_options const args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) - .max_page_size_bytes(8 * 1024); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetWriterTest, NonNullable) -{ - srand(31337); - auto expected = create_random_fixed_table(9, 9, false); - - auto filepath = temp_env->get_temp_filepath("NonNullable.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - 
CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetWriterTest, Struct) -{ - // Struct> - - auto names = {"Samuel Vimes", - "Carrot Ironfoundersson", - "Angua von Uberwald", - "Cheery Littlebottom", - "Detritus", - "Mr Slant"}; - - // `Name` column has all valid values. - auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; - - auto ages_col = - cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col}, {1, 1, 1, 1, 0, 1}}; - - auto is_human_col = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; - - auto struct_2 = - cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - - auto expected = table_view({*struct_2}); - - auto filepath = temp_env->get_temp_filepath("Struct.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); - cudf::io::read_parquet(read_args); -} - -TEST_P(ParquetV2Test, StructOfList) -{ - auto const is_v2 = GetParam(); - - // Struct>, - // flats:List> - // > - // > - - auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; - - auto ages_col = - cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // [] - // [NULL, 2, NULL] - // [4, 5] - // NULL - // [] - // [7, 8, 9] - lcw land_unit{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}}, valids2}; - 
- // [] - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8], []] - // [[]] - // [[]] - // [[], [], []] - lcw flats{lcw{}, - {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, - {{7, 8}, {}}, - lcw{lcw{}}, - lcw{lcw{}}, - lcw{lcw{}, lcw{}, lcw{}}}; - - auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col, land_unit, flats}, - {1, 1, 1, 1, 0, 1}}; - - auto is_human_col = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; - - auto struct_2 = - cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - - auto expected = table_view({*struct_2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("being"); - expected_metadata.column_metadata[0].child(0).set_name("human?"); - expected_metadata.column_metadata[0].child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); - expected_metadata.column_metadata[0].child(1).child(2).set_name("land_unit"); - expected_metadata.column_metadata[0].child(1).child(3).set_name("flats"); - - auto filepath = temp_env->get_temp_filepath("StructOfList.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); - auto const result = cudf::io::read_parquet(read_args); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_P(ParquetV2Test, ListOfStruct) -{ - auto const is_v2 = GetParam(); - - // List - // > - // > - - auto weight_col = 
cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; - - auto ages_col = - cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col}, {1, 1, 1, 1, 0, 1}}; - - auto is_human_col = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; - - auto struct_2 = - cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - - auto list_offsets_column = - cudf::test::fixed_width_column_wrapper{0, 2, 5, 5, 6}.release(); - auto num_list_rows = list_offsets_column->size() - 1; - - auto list_col = cudf::make_lists_column( - num_list_rows, std::move(list_offsets_column), std::move(struct_2), 0, {}); - - auto expected = table_view({*list_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("family"); - expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); - - auto filepath = temp_env->get_temp_filepath("ListOfStruct.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(is_v2) - .metadata(expected_metadata); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); - auto const result = cudf::io::read_parquet(read_args); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -// custom data sink that supports device writes. uses plain file io. 
-class custom_test_data_sink : public cudf::io::data_sink { - public: - explicit custom_test_data_sink(std::string const& filepath) - { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); - } - - virtual ~custom_test_data_sink() { flush(); } - - void host_write(void const* data, size_t size) override - { - outfile_.write(static_cast(data), size); - } - - [[nodiscard]] bool supports_device_write() const override { return true; } - - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override - { - this->device_write_async(gpu_data, size, stream).get(); - } - - std::future device_write_async(void const* gpu_data, - size_t size, - rmm::cuda_stream_view stream) override - { - return std::async(std::launch::deferred, [=] { - char* ptr = nullptr; - CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDefault, stream.value())); - stream.synchronize(); - outfile_.write(ptr, size); - CUDF_CUDA_TRY(cudaFreeHost(ptr)); - }); - } - - void flush() override { outfile_.flush(); } - - size_t bytes_written() override { return outfile_.tellp(); } - - private: - std::ofstream outfile_; -}; - -TEST_F(ParquetWriterTest, CustomDataSink) -{ - auto filepath = temp_env->get_temp_filepath("CustomDataSink.parquet"); - custom_test_data_sink custom_sink(filepath); - - srand(31337); - auto expected = create_random_fixed_table(5, 10, false); - - // write out using the custom sink - { - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - } - - // write out using a memmapped sink - std::vector buf_sink; - { - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buf_sink}, *expected); - cudf::io::write_parquet(args); - } - - // read them back in and make sure 
everything matches - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); - - cudf::io::parquet_reader_options buf_args = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{buf_sink.data(), buf_sink.size()}); - auto buf_tbl = cudf::io::read_parquet(buf_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(buf_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) -{ - auto filepath = temp_env->get_temp_filepath("DeviceWriteLargeishFile.parquet"); - custom_test_data_sink custom_sink(filepath); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_random_fixed_table(4, 4 * 1024 * 1024, false); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterTest, PartitionedWrite) -{ - auto source = create_compressible_fixed_table(16, 4 * 1024 * 1024, 1000, false); - - auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); - auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); - - auto partition1 = cudf::io::partition_info{10, 1024 * 1024}; - auto partition2 = cudf::io::partition_info{20 * 1024 + 7, 3 * 1024 * 1024}; - - auto expected1 = - cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); - auto expected2 = - cudf::slice(*source, {partition2.start_row, partition2.start_row + 
partition2.num_rows}); - - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder( - cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) - .partitions({partition1, partition2}) - .compression(cudf::io::compression_type::NONE); - cudf::io::write_parquet(args); - - auto result1 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); - - auto result2 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); -} - -TEST_P(ParquetV2Test, PartitionedWriteEmptyPartitions) -{ - auto const is_v2 = GetParam(); - - auto source = create_random_fixed_table(4, 4, false); - - auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); - auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); - - auto partition1 = cudf::io::partition_info{1, 0}; - auto partition2 = cudf::io::partition_info{1, 0}; - - auto expected1 = - cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); - auto expected2 = - cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); - - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder( - cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) - .partitions({partition1, partition2}) - .write_v2_headers(is_v2) - .compression(cudf::io::compression_type::NONE); - cudf::io::write_parquet(args); - - auto result1 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); - - auto result2 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, 
result2.tbl->view()); -} - -TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns) -{ - auto const is_v2 = GetParam(); - - auto source = create_random_fixed_table(0, 4, false); - - auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); - auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); - - auto partition1 = cudf::io::partition_info{1, 0}; - auto partition2 = cudf::io::partition_info{1, 0}; - - auto expected1 = - cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); - auto expected2 = - cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); - - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder( - cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) - .partitions({partition1, partition2}) - .write_v2_headers(is_v2) - .compression(cudf::io::compression_type::NONE); - cudf::io::write_parquet(args); - - auto result1 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); - - auto result2 = cudf::io::read_parquet( - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); -} - -template -std::string create_parquet_file(int num_cols) -{ - srand(31337); - auto const table = create_random_fixed_table(num_cols, 10, true); - auto const filepath = - temp_env->get_temp_filepath(typeid(T).name() + std::to_string(num_cols) + ".parquet"); - cudf::io::parquet_writer_options const out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table->view()); - cudf::io::write_parquet(out_opts); - return filepath; -} - -TEST_F(ParquetWriterTest, MultipleMismatchedSources) -{ - auto const int5file = create_parquet_file(5); - { - auto const float5file = create_parquet_file(5); - std::vector files{int5file, 
float5file}; - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{files}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); - } - { - auto const int10file = create_parquet_file(10); - std::vector files{int5file, int10file}; - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{files}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); - } -} - -TEST_F(ParquetWriterTest, Slice) -{ - auto col = - cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}, {true, true, true, false, true}}; - std::vector indices{2, 5}; - std::vector result = cudf::slice(col, indices); - cudf::table_view tbl{result}; - - auto filepath = temp_env->get_temp_filepath("Slice.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto read_table = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(read_table.tbl->view(), tbl); -} - -TEST_F(ParquetChunkedWriterTest, SingleTable) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedSingle.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(*table1); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); -} - -TEST_F(ParquetChunkedWriterTest, SimpleTable) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); 
- auto table2 = create_random_fixed_table(5, 5, true); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, LargeTables) -{ - srand(31337); - auto table1 = create_random_fixed_table(512, 4096, true); - auto table2 = create_random_fixed_table(512, 8192, true); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - auto md = cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2).close(); - ASSERT_EQ(md, nullptr); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, ManyTables) -{ - srand(31337); - std::vector> tables; - std::vector table_views; - constexpr int num_tables = 96; - for (int idx = 0; idx < num_tables; idx++) { - auto tbl = create_random_fixed_table(16, 64, true); - table_views.push_back(*tbl); - tables.push_back(std::move(tbl)); - } - - auto expected = cudf::concatenate(table_views); - - auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.parquet"); - cudf::io::chunked_parquet_writer_options args = - 
cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { - writer.write(tbl); - }); - auto md = writer.close({"dummy/path"}); - ASSERT_NE(md, nullptr); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetChunkedWriterTest, Strings) -{ - std::vector> cols; - - bool mask1[] = {true, true, false, true, true, true, true}; - std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; - cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); - cols.push_back(strings1.release()); - cudf::table tbl1(std::move(cols)); - - bool mask2[] = {false, true, true, true, true, true, true}; - std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; - cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); - cols.push_back(strings2.release()); - cudf::table tbl2(std::move(cols)); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetChunkedWriterTest, ListColumn) -{ - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto 
valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // COL0 (Same nullability) ==================== - // [NULL, 2, NULL] - // [] - // [4, 5] - // NULL - lcw col0_tbl0{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; - - // [7, 8, 9] - // [] - // [NULL, 11] - // NULL - lcw col0_tbl1{{{7, 8, 9}, {}, {{10, 11}, valids}, {}}, valids2}; - - // COL1 (Nullability different in different chunks, test of merging nullability in writer) - // [NULL, 2, NULL] - // [] - // [4, 5] - // [] - lcw col1_tbl0{{{1, 2, 3}, valids}, {}, {4, 5}, {}}; - - // [7, 8, 9] - // [] - // [10, 11] - // NULL - lcw col1_tbl1{{{7, 8, 9}, {}, {10, 11}, {}}, valids2}; - - // COL2 (non-nested columns to test proper schema construction) - size_t num_rows_tbl0 = static_cast(col0_tbl0).size(); - size_t num_rows_tbl1 = static_cast(col0_tbl1).size(); - auto seq_col0 = random_values(num_rows_tbl0); - auto seq_col1 = random_values(num_rows_tbl1); - - column_wrapper col2_tbl0{seq_col0.begin(), seq_col0.end(), valids}; - column_wrapper col2_tbl1{seq_col1.begin(), seq_col1.end(), valids2}; - - auto tbl0 = table_view({col0_tbl0, col1_tbl0, col2_tbl0}); - auto tbl1 = table_view({col0_tbl1, col1_tbl1, col2_tbl1}); - - auto expected = cudf::concatenate(std::vector({tbl0, tbl1})); - - auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(tbl0).write(tbl1); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetChunkedWriterTest, ListOfStruct) -{ - // Table 1 - auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; - 
auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; - auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; - auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; - auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; - - auto list_offsets_column_1 = - cudf::test::fixed_width_column_wrapper{0, 2, 3, 3}.release(); - auto num_list_rows_1 = list_offsets_column_1->size() - 1; - - auto list_col_1 = cudf::make_lists_column( - num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); - - auto table_1 = table_view({*list_col_1}); - - // Table 2 - auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; - auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; - auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; - auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; - auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; - - auto list_offsets_column_2 = - cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}.release(); - auto num_list_rows_2 = list_offsets_column_2->size() - 1; - - auto list_col_2 = cudf::make_lists_column( - num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); - - auto table_2 = table_view({*list_col_2}); - - auto full_table = cudf::concatenate(std::vector({table_1, table_2})); - - cudf::io::table_input_metadata expected_metadata(table_1); - expected_metadata.column_metadata[0].set_name("family"); - expected_metadata.column_metadata[0].child(1).set_nullability(false); - expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); - 
expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); - - auto filepath = temp_env->get_temp_filepath("ChunkedListOfStruct.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - args.set_metadata(expected_metadata); - cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) -{ - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // Table 1 =========================== - - // [] - // [NULL, 2, NULL] - // [4, 5] - // NULL - lcw land_1{{{}, {{1, 2, 3}, valids}, {4, 5}, {}}, valids2}; - - // [] - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8], []] - // [[]] - lcw flats_1{lcw{}, {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}, {}}, lcw{lcw{}}}; - - auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3, 1.1}}; - auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5, 31}}; - auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1, land_1, flats_1}; - auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false, false}}; - auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; - - auto list_offsets_column_1 = - cudf::test::fixed_width_column_wrapper{0, 2, 3, 4}.release(); - auto num_list_rows_1 = list_offsets_column_1->size() - 1; - - auto list_col_1 = cudf::make_lists_column( - 
num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); - - auto table_1 = table_view({*list_col_1}); - - // Table 2 =========================== - - // [] - // [7, 8, 9] - lcw land_2{{}, {7, 8, 9}}; - - // [[]] - // [[], [], []] - lcw flats_2{lcw{lcw{}}, lcw{lcw{}, lcw{}, lcw{}}}; - - auto weight_2 = cudf::test::fixed_width_column_wrapper{{-1.0, -1.0}}; - auto ages_2 = cudf::test::fixed_width_column_wrapper{{351, 351}, {1, 0}}; - auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {0, 1}}; - auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false}, {1, 0}}; - auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; - - auto list_offsets_column_2 = - cudf::test::fixed_width_column_wrapper{0, 1, 2}.release(); - auto num_list_rows_2 = list_offsets_column_2->size() - 1; - - auto list_col_2 = cudf::make_lists_column( - num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); - - auto table_2 = table_view({*list_col_2}); - - auto full_table = cudf::concatenate(std::vector({table_1, table_2})); - - cudf::io::table_input_metadata expected_metadata(table_1); - expected_metadata.column_metadata[0].set_name("family"); - expected_metadata.column_metadata[0].child(1).set_nullability(false); - expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); - expected_metadata.column_metadata[0].child(1).child(1).child(2).set_name("land_unit"); - expected_metadata.column_metadata[0].child(1).child(1).child(3).set_name("flats"); - - auto filepath = temp_env->get_temp_filepath("ListOfStructOfStructOfListOfList.parquet"); - cudf::io::chunked_parquet_writer_options args = - 
cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - args.set_metadata(expected_metadata); - cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - - // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. - auto result_parent_list = result.tbl->get_column(0); - auto result_struct_2 = result_parent_list.child(cudf::lists_column_view::child_column_index); - EXPECT_EQ(result_struct_2.nullable(), false); -} - -TEST_F(ParquetChunkedWriterTest, MismatchedTypes) -{ - srand(31337); - auto table1 = create_random_fixed_table(4, 4, true); - auto table2 = create_random_fixed_table(4, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - writer.write(*table1); - EXPECT_THROW(writer.write(*table2), cudf::logic_error); - writer.close(); -} - -TEST_F(ParquetChunkedWriterTest, ChunkedWriteAfterClosing) -{ - srand(31337); - auto table = create_random_fixed_table(4, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedWriteAfterClosing.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - writer.write(*table).close(); - EXPECT_THROW(writer.write(*table), cudf::logic_error); -} - -TEST_F(ParquetChunkedWriterTest, ReadingUnclosedFile) -{ - srand(31337); - auto table = create_random_fixed_table(4, 4, true); - - auto 
filepath = temp_env->get_temp_filepath("ReadingUnclosedFile.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - writer.write(*table); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); -} - -TEST_F(ParquetChunkedWriterTest, MismatchedStructure) -{ - srand(31337); - auto table1 = create_random_fixed_table(4, 4, true); - auto table2 = create_random_fixed_table(3, 4, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - writer.write(*table1); - EXPECT_THROW(writer.write(*table2), cudf::logic_error); - writer.close(); -} - -TEST_F(ParquetChunkedWriterTest, MismatchedStructureList) -{ - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // COL0 (mismatched depth) ==================== - // [NULL, 2, NULL] - // [] - // [4, 5] - // NULL - lcw col00{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; - - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8]] - // [] - // [[]] - lcw col01{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; - - // COL2 (non-nested columns to test proper schema construction) - size_t num_rows = static_cast(col00).size(); - auto seq_col0 = random_values(num_rows); - auto seq_col1 = random_values(num_rows); - - column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids}; - column_wrapper col11{seq_col1.begin(), seq_col1.end(), 
valids2}; - - auto tbl0 = table_view({col00, col10}); - auto tbl1 = table_view({col01, col11}); - - auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer writer(args); - writer.write(tbl0); - EXPECT_THROW(writer.write(tbl1), cudf::logic_error); -} - -TEST_F(ParquetChunkedWriterTest, DifferentNullability) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - auto table2 = create_random_fixed_table(5, 5, false); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) -{ - // Struct, - // age:int - // > (nullable) - // > (non-nullable) - - // Table 1: is_human and struct_1 are non-nullable but should be nullable when read back. 
- auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; - auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; - auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; - auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; - auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; - auto table_1 = cudf::table_view({struct_2_1}); - - // Table 2: struct_1 and is_human are nullable now so if we hadn't assumed worst case (nullable) - // when writing table_1, we would have wrong pages for it. - auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; - auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; - auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; - auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}}; - auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; - auto table_2 = cudf::table_view({struct_2_2}); - - auto full_table = cudf::concatenate(std::vector({table_1, table_2})); - - cudf::io::table_input_metadata expected_metadata(table_1); - expected_metadata.column_metadata[0].set_name("being"); - expected_metadata.column_metadata[0].child(0).set_name("human?"); - expected_metadata.column_metadata[0].child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); - - auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - args.set_metadata(expected_metadata); - cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); - - cudf::io::parquet_reader_options read_opts = - 
cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(ParquetChunkedWriterTest, ForcedNullability) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, false); - auto table2 = create_random_fixed_table(5, 5, false); - - auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet"); - - cudf::io::table_input_metadata metadata(*table1); - - // In the absence of prescribed per-column nullability in metadata, the writer assumes the worst - // and considers all columns nullable. However cudf::concatenate will not force nulls in case no - // columns are nullable. To get the expected result, we tell the writer the nullability of all - // columns in advance. - for (auto& col_meta : metadata.column_metadata) { - col_meta.set_nullability(false); - } - - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}) - .metadata(std::move(metadata)); - cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) -{ - srand(31337); - - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); - auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); - - using lcw = cudf::test::lists_column_wrapper; - - // COL0 ==================== - // [1, 2, 3] - // [] - // [4, 5] - // NULL - lcw col00{{{1, 2, 3}, {}, {4, 5}, 
{}}, valids2}; - - // [7] - // [] - // [8, 9, 10, 11] - // NULL - lcw col01{{{7}, {}, {8, 9, 10, 11}, {}}, valids2}; - - // COL1 (non-nested columns to test proper schema construction) - size_t num_rows = static_cast(col00).size(); - auto seq_col0 = random_values(num_rows); - auto seq_col1 = random_values(num_rows); - - column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids}; - column_wrapper col11{seq_col1.begin(), seq_col1.end(), valids2}; - - auto table1 = table_view({col00, col10}); - auto table2 = table_view({col01, col11}); - - auto full_table = cudf::concatenate(std::vector({table1, table2})); - - cudf::io::table_input_metadata metadata(table1); - metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level - metadata.column_metadata[0].child(1).set_nullability( - false); // non-nullable at second (leaf) level - metadata.column_metadata[1].set_nullability(true); - - auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet"); - - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}) - .metadata(std::move(metadata)); - cudf::io::parquet_chunked_writer(args).write(table1).write(table2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) -{ - // Struct, - // age:int - // > (nullable) - // > (non-nullable) - - // Table 1: is_human and struct_2 are non-nullable and should stay that way when read back. 
- auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; - auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; - auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1}; - auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}}; - auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}}; - auto table_1 = cudf::table_view({struct_2_1}); - - auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}}; - auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}}; - auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}}; - auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}}; - auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; - auto table_2 = cudf::table_view({struct_2_2}); - - auto full_table = cudf::concatenate(std::vector({table_1, table_2})); - - cudf::io::table_input_metadata expected_metadata(table_1); - expected_metadata.column_metadata[0].set_name("being").set_nullability(false); - expected_metadata.column_metadata[0].child(0).set_name("human?").set_nullability(false); - expected_metadata.column_metadata[0].child(1).set_name("particulars"); - expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); - - auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - args.set_metadata(expected_metadata); - cudf::io::parquet_chunked_writer(args).write(table_1).write(table_2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); - 
cudf::test::expect_metadata_equal(expected_metadata, result.metadata); -} - -TEST_F(ParquetChunkedWriterTest, ReadRowGroups) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - auto table2 = create_random_fixed_table(5, 5, true); - - auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - { - cudf::io::parquet_chunked_writer(args).write(*table1).write(*table2); - } - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .row_groups({{1, 0, 1}}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); -} - -TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError) -{ - srand(31337); - auto table1 = create_random_fixed_table(5, 5, true); - - auto filepath = temp_env->get_temp_filepath("ChunkedRowGroupsError.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(*table1); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).row_groups({{0, 1}}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); - read_opts.set_row_groups({{-1}}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); - read_opts.set_row_groups({{0}, {0}}); - EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); -} - -TEST_F(ParquetWriterTest, DecimalWrite) -{ - constexpr cudf::size_type num_rows = 500; - auto seq_col0 = random_values(num_rows); - auto seq_col1 = random_values(num_rows); - - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { 
return i % 2 == 0; }); - - auto col0 = cudf::test::fixed_point_column_wrapper{ - seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; - auto col1 = cudf::test::fixed_point_column_wrapper{ - seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; - - auto table = table_view({col0, col1}); - - auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table); - - cudf::io::table_input_metadata expected_metadata(table); - - // verify failure if too small a precision is given - expected_metadata.column_metadata[0].set_decimal_precision(7); - expected_metadata.column_metadata[1].set_decimal_precision(1); - args.set_metadata(expected_metadata); - EXPECT_THROW(cudf::io::write_parquet(args), cudf::logic_error); - - // verify success if equal precision is given - expected_metadata.column_metadata[0].set_decimal_precision(7); - expected_metadata.column_metadata[1].set_decimal_precision(9); - args.set_metadata(std::move(expected_metadata)); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); -} - -TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) -{ - // write out two 31 row tables and make sure they get - // read back with all their validity bits in the right place - - using T = TypeParam; - - int num_els = 31; - std::vector> cols; - - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - - true, true, true, true, true, true, true, true, true}; - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, 
c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); - - T c2a[num_els]; - std::fill(c2a, c2a + num_els, static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) -{ - // write out two 33 row tables and make sure they get - // read back with all their validity bits in the right place - - using T = TypeParam; - - int num_els = 33; - std::vector> cols; - - bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true, - true, true, true, true, true, true, true, true, true, true, true}; - - T c1a[num_els]; - std::fill(c1a, c1a + num_els, static_cast(5)); - T c1b[num_els]; - std::fill(c1b, c1b + num_els, static_cast(6)); - column_wrapper c1a_w(c1a, c1a + num_els, mask); - column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); - - T c2a[num_els]; - std::fill(c2a, c2a + num_els, 
static_cast(8)); - T c2b[num_els]; - std::fill(c2b, c2b + num_els, static_cast(9)); - column_wrapper c2a_w(c2a, c2a + num_els, mask); - column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); - - auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); - - auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet"); - cudf::io::chunked_parquet_writer_options args = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); - cudf::io::parquet_chunked_writer(args).write(tbl1).write(tbl2); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -// custom mem mapped data sink that supports device writes -template -class custom_test_memmap_sink : public cudf::io::data_sink { - public: - explicit custom_test_memmap_sink(std::vector* mm_writer_buf) - { - mm_writer = cudf::io::data_sink::create(mm_writer_buf); - } - - virtual ~custom_test_memmap_sink() { mm_writer->flush(); } - - void host_write(void const* data, size_t size) override { mm_writer->host_write(data, size); } - - [[nodiscard]] bool supports_device_write() const override { return supports_device_writes; } - - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override - { - this->device_write_async(gpu_data, size, stream).get(); - } - - std::future device_write_async(void const* gpu_data, - size_t size, - rmm::cuda_stream_view stream) override - { - return std::async(std::launch::deferred, [=] { - char* ptr = nullptr; - CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDefault, stream.value())); - stream.synchronize(); - mm_writer->host_write(ptr, size); - CUDF_CUDA_TRY(cudaFreeHost(ptr)); - }); 
- } - - void flush() override { mm_writer->flush(); } - - size_t bytes_written() override { return mm_writer->bytes_written(); } - - private: - std::unique_ptr mm_writer; -}; - -TEST_F(ParquetWriterStressTest, LargeTableWeakCompression) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_random_fixed_table(16, 4 * 1024 * 1024, false); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterStressTest, LargeTableGoodCompression) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 128 * 1024, false); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterStressTest, LargeTableWithValids) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - 
// exercises multiple rowgroups - srand(31337); - auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 6, true); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableWeakCompression) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_random_fixed_table(16, 4 * 1024 * 1024, false); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableGoodCompression) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 128 * 1024, false); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); 
- cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableWithValids) -{ - std::vector mm_buf; - mm_buf.reserve(4 * 1024 * 1024 * 16); - custom_test_memmap_sink custom_sink(&mm_buf); - - // exercises multiple rowgroups - srand(31337); - auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 6, true); - - // write out using the custom sink (which uses device writes) - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options custom_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); - auto custom_tbl = cudf::io::read_parquet(custom_args); - CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); -} - -TEST_F(ParquetReaderTest, UserBounds) -{ - // trying to read more rows than there are should result in - // receiving the properly capped # of rows - { - srand(31337); - auto expected = create_random_fixed_table(4, 4, false); - - auto filepath = temp_env->get_temp_filepath("TooManyRows.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); - cudf::io::write_parquet(args); - - // attempt to read more rows than there actually are - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).num_rows(16); - auto result = cudf::io::read_parquet(read_opts); - - // we should only get back 4 rows - EXPECT_EQ(result.tbl->view().column(0).size(), 4); - } - - // trying to read past the end of the # of 
actual rows should result - // in empty columns. - { - srand(31337); - auto expected = create_random_fixed_table(4, 4, false); - - auto filepath = temp_env->get_temp_filepath("PastBounds.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); - cudf::io::write_parquet(args); - - // attempt to read more rows than there actually are - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).skip_rows(4); - auto result = cudf::io::read_parquet(read_opts); - - // we should get empty columns back - EXPECT_EQ(result.tbl->view().num_columns(), 4); - EXPECT_EQ(result.tbl->view().column(0).size(), 0); - } - - // trying to read 0 rows should result in empty columns - { - srand(31337); - auto expected = create_random_fixed_table(4, 4, false); - - auto filepath = temp_env->get_temp_filepath("ZeroRows.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); - cudf::io::write_parquet(args); - - // attempt to read more rows than there actually are - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).num_rows(0); - auto result = cudf::io::read_parquet(read_opts); - - EXPECT_EQ(result.tbl->view().num_columns(), 4); - EXPECT_EQ(result.tbl->view().column(0).size(), 0); - } - - // trying to read 0 rows past the end of the # of actual rows should result - // in empty columns. 
- { - srand(31337); - auto expected = create_random_fixed_table(4, 4, false); - - auto filepath = temp_env->get_temp_filepath("ZeroRowsPastBounds.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); - cudf::io::write_parquet(args); - - // attempt to read more rows than there actually are - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .skip_rows(4) - .num_rows(0); - auto result = cudf::io::read_parquet(read_opts); - - // we should get empty columns back - EXPECT_EQ(result.tbl->view().num_columns(), 4); - EXPECT_EQ(result.tbl->view().column(0).size(), 0); - } -} - -TEST_F(ParquetReaderTest, UserBoundsWithNulls) -{ - // clang-format off - cudf::test::fixed_width_column_wrapper col{{1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7, 8,8,8,8,8,8,8,8} - ,{1,1,1,0,0,0,1,1, 1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,0,0, 1,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0}}; - // clang-format on - cudf::table_view tbl({col}); - auto filepath = temp_env->get_temp_filepath("UserBoundsWithNulls.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_parquet(out_args); - - // skip_rows / num_rows - // clang-format off - std::vector> params{ {-1, -1}, {1, 3}, {3, -1}, - {31, -1}, {32, -1}, {33, -1}, - {31, 5}, {32, 5}, {33, 5}, - {-1, 7}, {-1, 31}, {-1, 32}, {-1, 33}, - {62, -1}, {63, -1}, - {62, 2}, {63, 1}}; - // clang-format on - for (auto p : params) { - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - if (p.first >= 0) { read_args.set_skip_rows(p.first); } - if (p.second >= 0) { read_args.set_num_rows(p.second); } - auto result = 
cudf::io::read_parquet(read_args); - - p.first = p.first < 0 ? 0 : p.first; - p.second = p.second < 0 ? static_cast(col).size() - p.first : p.second; - std::vector slice_indices{p.first, p.first + p.second}; - auto expected = cudf::slice(col, slice_indices); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); - } -} - -TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) -{ - constexpr int num_rows = 32 * 1024; - - std::mt19937 gen(6542); - std::bernoulli_distribution bn(0.7f); - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); - auto values = thrust::make_counting_iterator(0); - - // int64 - cudf::test::fixed_width_column_wrapper c0(values, values + num_rows, valids); - - // list - constexpr int floats_per_row = 4; - auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [floats_per_row](cudf::size_type idx) { return idx * floats_per_row; }); - cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, - c1_offset_iter + num_rows + 1); - cudf::test::fixed_width_column_wrapper c1_floats( - values, values + (num_rows * floats_per_row), valids); - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); - - auto _c1 = cudf::make_lists_column( - num_rows, c1_offsets.release(), c1_floats.release(), null_count, std::move(null_mask)); - auto c1 = cudf::purge_nonempty_nulls(*_c1); - - // list> - auto c2 = make_parquet_list_list_col(0, num_rows, 5, 8, true); - - // struct, int, float> - std::vector strings{ - "abc", "x", "bananas", "gpu", "minty", "backspace", "", "cayenne", "turbine", "soft"}; - std::uniform_int_distribution uni(0, strings.size() - 1); - auto string_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](cudf::size_type idx) { return strings[uni(gen)]; }); - constexpr int string_per_row = 3; - constexpr int num_string_rows = num_rows * string_per_row; - cudf::test::strings_column_wrapper 
string_col{string_iter, string_iter + num_string_rows}; - auto offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); - cudf::test::fixed_width_column_wrapper offsets(offset_iter, - offset_iter + num_rows + 1); - - auto _c3_valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); - std::vector c3_valids(num_rows); - std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin()); - std::tie(null_mask, null_count) = cudf::test::detail::make_null_mask(valids, valids + num_rows); - auto _c3_list = cudf::make_lists_column( - num_rows, offsets.release(), string_col.release(), null_count, std::move(null_mask)); - auto c3_list = cudf::purge_nonempty_nulls(*_c3_list); - cudf::test::fixed_width_column_wrapper c3_ints(values, values + num_rows, valids); - cudf::test::fixed_width_column_wrapper c3_floats(values, values + num_rows, valids); - std::vector> c3_children; - c3_children.push_back(std::move(c3_list)); - c3_children.push_back(c3_ints.release()); - c3_children.push_back(c3_floats.release()); - cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids); - auto c3 = cudf::purge_nonempty_nulls(_c3); - - // write it out - cudf::table_view tbl({c0, *c1, *c2, *c3}); - auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsMixedTypes.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_parquet(out_args); - - // read it back - std::vector> params{ - {-1, -1}, {0, num_rows}, {1, num_rows - 1}, {num_rows - 1, 1}, {517, 22000}}; - for (auto p : params) { - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - if (p.first >= 0) { read_args.set_skip_rows(p.first); } - if (p.second >= 0) { read_args.set_num_rows(p.second); } - auto result = 
cudf::io::read_parquet(read_args); - - p.first = p.first < 0 ? 0 : p.first; - p.second = p.second < 0 ? num_rows - p.first : p.second; - std::vector slice_indices{p.first, p.first + p.second}; - auto expected = cudf::slice(tbl, slice_indices); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected[0]); - } -} - -TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) -{ - constexpr int num_rows = 30 * 1000000; - - std::mt19937 gen(6747); - std::bernoulli_distribution bn(0.7f); - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); - auto values = thrust::make_counting_iterator(0); - - cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - - // this file will have row groups of 1,000,000 each - cudf::table_view tbl({col}); - auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_parquet(out_args); - - // skip_rows / num_rows - // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1}, - {31, 1}, {32, 1}, {33, 1}, - // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1}, - {16785678, 3}, {22996176, 31}, - {24001231, 17}, {29000001, 989999}, {29999999, 1} }; - // clang-format on - for (auto p : params) { - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - if (p.first >= 0) { read_args.set_skip_rows(p.first); } - if (p.second >= 0) { read_args.set_num_rows(p.second); } - auto result = cudf::io::read_parquet(read_args); - - p.first = p.first < 0 ? 0 : p.first; - p.second = p.second < 0 ? 
static_cast(col).size() - p.first : p.second; - std::vector slice_indices{p.first, p.first + p.second}; - auto expected = cudf::slice(col, slice_indices); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); - } -} - -TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge) -{ - constexpr int num_rows = 5 * 1000000; - auto colp = make_parquet_list_list_col(0, num_rows, 5, 8, true); - cudf::column_view col = *colp; - - // this file will have row groups of 1,000,000 each - cudf::table_view tbl({col}); - auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); - cudf::io::write_parquet(out_args); - - // skip_rows / num_rows - // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1}, - {31, 1}, {32, 1}, {33, 1}, - // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, - {1678567, 3}, {4299676, 31}, - {4001231, 17}, {1900000, 989999}, {4999999, 1} }; - // clang-format on - for (auto p : params) { - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - if (p.first >= 0) { read_args.set_skip_rows(p.first); } - if (p.second >= 0) { read_args.set_num_rows(p.second); } - auto result = cudf::io::read_parquet(read_args); - - p.first = p.first < 0 ? 0 : p.first; - p.second = p.second < 0 ? 
static_cast(col).size() - p.first : p.second; - std::vector slice_indices{p.first, p.first + p.second}; - auto expected = cudf::slice(col, slice_indices); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]); - } -} - -TEST_F(ParquetReaderTest, ReorderedColumns) -{ - { - auto a = cudf::test::strings_column_wrapper{{"a", "", "c"}, {true, false, true}}; - auto b = cudf::test::fixed_width_column_wrapper{1, 2, 3}; - - cudf::table_view tbl{{a, b}}; - auto filepath = temp_env->get_temp_filepath("ReorderedColumns.parquet"); - cudf::io::table_input_metadata md(tbl); - md.column_metadata[0].set_name("a"); - md.column_metadata[1].set_name("b"); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl).metadata(md); - cudf::io::write_parquet(opts); - - // read them out of order - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .columns({"b", "a"}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), b); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); - } - - { - auto a = cudf::test::fixed_width_column_wrapper{1, 2, 3}; - auto b = cudf::test::strings_column_wrapper{{"a", "", "c"}, {true, false, true}}; - - cudf::table_view tbl{{a, b}}; - auto filepath = temp_env->get_temp_filepath("ReorderedColumns2.parquet"); - cudf::io::table_input_metadata md(tbl); - md.column_metadata[0].set_name("a"); - md.column_metadata[1].set_name("b"); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl).metadata(md); - cudf::io::write_parquet(opts); - - // read them out of order - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .columns({"b", "a"}); - auto result = cudf::io::read_parquet(read_opts); - - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), b); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); - } - - auto a = cudf::test::fixed_width_column_wrapper{1, 2, 3, 10, 20, 30}; - auto b = cudf::test::strings_column_wrapper{{"a", "", "c", "cats", "dogs", "owls"}, - {true, false, true, true, false, true}}; - auto c = cudf::test::fixed_width_column_wrapper{{15, 16, 17, 25, 26, 32}, - {false, true, true, true, true, false}}; - auto d = cudf::test::strings_column_wrapper{"ducks", "sheep", "cows", "fish", "birds", "ants"}; - - cudf::table_view tbl{{a, b, c, d}}; - auto filepath = temp_env->get_temp_filepath("ReorderedColumns3.parquet"); - cudf::io::table_input_metadata md(tbl); - md.column_metadata[0].set_name("a"); - md.column_metadata[1].set_name("b"); - md.column_metadata[2].set_name("c"); - md.column_metadata[3].set_name("d"); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) - .metadata(std::move(md)); - cudf::io::write_parquet(opts); - - { - // read them out of order - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .columns({"d", "a", "b", "c"}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), d); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), a); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), b); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), c); - } - - { - // read them out of order - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .columns({"c", "d", "a", "b"}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), c); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), d); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), a); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), b); - } - - { - // read them out of order - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .columns({"d", "c", "b", "a"}); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), d); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), c); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), b); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(3), a); - } -} - -TEST_F(ParquetReaderTest, SelectNestedColumn) -{ - // Struct>, - // flats:List> - // > - // > - - auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; - - auto ages_col = - cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}}; - - auto is_human_col = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; - - auto struct_2 = - cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - - auto input = table_view({*struct_2}); - - cudf::io::table_input_metadata input_metadata(input); - input_metadata.column_metadata[0].set_name("being"); - input_metadata.column_metadata[0].child(0).set_name("human?"); - input_metadata.column_metadata[0].child(1).set_name("particulars"); - input_metadata.column_metadata[0].child(1).child(0).set_name("weight"); - input_metadata.column_metadata[0].child(1).child(1).set_name("age"); - - auto filepath = temp_env->get_temp_filepath("SelectNestedColumn.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, input) - .metadata(std::move(input_metadata)); - 
cudf::io::write_parquet(args); - - { // Test selecting a single leaf from the table - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) - .columns({"being.particulars.age"}); - auto const result = cudf::io::read_parquet(read_args); - - auto expect_ages_col = cudf::test::fixed_width_column_wrapper{ - {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}}; - auto expect_s_2 = - cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release(); - auto expected = table_view({*expect_s_2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("being"); - expected_metadata.column_metadata[0].child(0).set_name("particulars"); - expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - } - - { // Test selecting a non-leaf and expecting all hierarchy from that node onwards - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) - .columns({"being.particulars"}); - auto const result = cudf::io::read_parquet(read_args); - - auto expected_weights_col = - cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; - - auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ - {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto expected_s_1 = cudf::test::structs_column_wrapper{ - {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}}; - - auto expect_s_2 = - cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release(); - auto expected = table_view({*expect_s_2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("being"); 
- expected_metadata.column_metadata[0].child(0).set_name("particulars"); - expected_metadata.column_metadata[0].child(0).child(0).set_name("weight"); - expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - } - - { // Test selecting struct children out of order - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)) - .columns({"being.particulars.age", "being.particulars.weight", "being.human?"}); - auto const result = cudf::io::read_parquet(read_args); - - auto expected_weights_col = - cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; - - auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ - {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; - - auto expected_is_human_col = cudf::test::fixed_width_column_wrapper{ - {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; - - auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col}, - {1, 1, 1, 1, 0, 1}}; - - auto expect_s_2 = - cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}} - .release(); - - auto expected = table_view({*expect_s_2}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("being"); - expected_metadata.column_metadata[0].child(0).set_name("particulars"); - expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); - expected_metadata.column_metadata[0].child(0).child(1).set_name("weight"); - expected_metadata.column_metadata[0].child(1).set_name("human?"); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - cudf::test::expect_metadata_equal(expected_metadata, result.metadata); - } -} - -TEST_F(ParquetReaderTest, DecimalRead) -{ - { - /* We could add a dataset to include this 
file, but we don't want tests in cudf to have data. - This test is a temporary test until python gains the ability to write decimal, so we're - embedding - a parquet file directly into the code here to prevent issues with finding the file */ - unsigned char const decimals_parquet[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xb0, 0x03, 0x15, 0xb8, 0x03, 0x2c, 0x15, 0x6a, - 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, - 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, 0x00, 0x00, 0x00, 0xd8, 0x01, 0xf0, 0xd7, 0x04, 0x00, - 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, 0x68, 0x12, 0xdc, 0xff, 0xbd, 0x18, 0xfd, 0xff, 0x64, - 0x13, 0x80, 0x00, 0xb3, 0x5d, 0x62, 0x00, 0x90, 0x35, 0xa9, 0xff, 0xa2, 0xde, 0xe3, 0xff, - 0xe9, 0xbf, 0x96, 0xff, 0x1f, 0x8a, 0x98, 0xff, 0xb1, 0x50, 0x34, 0x00, 0x88, 0x24, 0x59, - 0x00, 0x2a, 0x33, 0xbe, 0xff, 0xd5, 0x16, 0xbc, 0xff, 0x13, 0x50, 0x8d, 0xff, 0xcb, 0x63, - 0x2d, 0x00, 0x80, 0x8f, 0xbe, 0xff, 0x82, 0x40, 0x10, 0x00, 0x84, 0x68, 0x70, 0xff, 0x9b, - 0x69, 0x78, 0x00, 0x14, 0x6c, 0x10, 0x00, 0x50, 0xd9, 0xe1, 0xff, 0xaa, 0xcd, 0x6a, 0x00, - 0xcf, 0xb1, 0x28, 0x00, 0x77, 0x57, 0x8d, 0x00, 0xee, 0x05, 0x79, 0x00, 0xf0, 0x15, 0xeb, - 0xff, 0x02, 0xe2, 0x06, 0x00, 0x87, 0x43, 0x86, 0x00, 0xf8, 0x2d, 0x2e, 0x00, 0xee, 0x2e, - 0x98, 0xff, 0x39, 0xcb, 0x4d, 0x00, 0x1e, 0x6b, 0xea, 0xff, 0x80, 0x8e, 0x6c, 0xff, 0x97, - 0x25, 0x26, 0x00, 0x4d, 0x0d, 0x0a, 0x00, 0xca, 0x64, 0x7f, 0x00, 0xf4, 0xbe, 0xa1, 0xff, - 0xe2, 0x12, 0x6c, 0xff, 0xbd, 0x77, 0xae, 0xff, 0xf9, 0x4b, 0x36, 0x00, 0xb0, 0xe3, 0x79, - 0xff, 0xa2, 0x2a, 0x29, 0x00, 0xcd, 0x06, 0xbc, 0xff, 0x2d, 0xa3, 0x7e, 0x00, 0xa9, 0x08, - 0xa1, 0xff, 0xbf, 0x81, 0xd0, 0xff, 0x4f, 0x03, 0x73, 0x00, 0xb0, 0x99, 0x0c, 0x00, 0xbd, - 0x6f, 0xf8, 0xff, 0x6b, 0x02, 0x05, 0x00, 0xc1, 0xe1, 0xba, 0xff, 0x81, 0x69, 0x67, 0xff, - 0x7f, 0x96, 0x98, 0x00, 0x15, 0x00, 0x15, 0xd0, 0x06, 0x15, 0xda, 0x06, 0x2c, 0x15, 0x6a, - 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, 0x02, 
0x28, 0x08, 0xff, 0x3f, 0x7a, 0x10, - 0xf3, 0x5a, 0x00, 0x00, 0x18, 0x08, 0x01, 0xc0, 0x85, 0xef, 0x0c, 0xa5, 0xff, 0xff, 0x00, - 0x00, 0x00, 0xa8, 0x03, 0xf4, 0xa7, 0x01, 0x04, 0x00, 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, - 0x55, 0x6f, 0xc5, 0xe4, 0x9f, 0x1a, 0x00, 0x00, 0x47, 0x89, 0x0a, 0xe8, 0x58, 0xf0, 0xff, - 0xff, 0x63, 0xee, 0x21, 0xdd, 0xdd, 0xca, 0xff, 0xff, 0xbe, 0x6f, 0x3b, 0xaa, 0xe9, 0x3d, - 0x00, 0x00, 0xd6, 0x91, 0x2a, 0xb7, 0x08, 0x02, 0x00, 0x00, 0x75, 0x45, 0x2c, 0xd7, 0x76, - 0x0c, 0x00, 0x00, 0x54, 0x49, 0x92, 0x44, 0x9c, 0xbf, 0xff, 0xff, 0x41, 0xa9, 0x6d, 0xec, - 0x7a, 0xd0, 0xff, 0xff, 0x27, 0xa0, 0x23, 0x41, 0x44, 0xc1, 0xff, 0xff, 0x18, 0xd4, 0xe1, - 0x30, 0xd3, 0xe0, 0xff, 0xff, 0x59, 0xac, 0x14, 0xf4, 0xec, 0x58, 0x00, 0x00, 0x2c, 0x17, - 0x29, 0x57, 0x44, 0x13, 0x00, 0x00, 0xa2, 0x0d, 0x4a, 0xcc, 0x63, 0xff, 0xff, 0xff, 0x81, - 0x33, 0xbc, 0xda, 0xd5, 0xda, 0xff, 0xff, 0x4c, 0x05, 0xf4, 0x78, 0x19, 0xea, 0xff, 0xff, - 0x06, 0x71, 0x25, 0xde, 0x5a, 0xaf, 0xff, 0xff, 0x95, 0x32, 0x5f, 0x76, 0x98, 0xb3, 0xff, - 0xff, 0xf1, 0x34, 0x3c, 0xbf, 0xa8, 0xbe, 0xff, 0xff, 0x27, 0x73, 0x40, 0x0c, 0x7d, 0xcd, - 0xff, 0xff, 0x68, 0xa9, 0xc2, 0xe9, 0x2c, 0x03, 0x00, 0x00, 0x3f, 0x79, 0xd9, 0x04, 0x8c, - 0xe5, 0xff, 0xff, 0x91, 0xb4, 0x9b, 0xe3, 0x8f, 0x21, 0x00, 0x00, 0xb8, 0x20, 0xc8, 0xc2, - 0x4d, 0xa6, 0xff, 0xff, 0x47, 0xfa, 0xde, 0x36, 0x4a, 0xf3, 0xff, 0xff, 0x72, 0x80, 0x94, - 0x59, 0xdd, 0x4e, 0x00, 0x00, 0x29, 0xe4, 0xd6, 0x43, 0xb0, 0xf0, 0xff, 0xff, 0x68, 0x36, - 0xbc, 0x2d, 0xd1, 0xa9, 0xff, 0xff, 0xbc, 0xe4, 0xbe, 0xd7, 0xed, 0x1b, 0x00, 0x00, 0x02, - 0x8b, 0xcb, 0xd7, 0xed, 0x47, 0x00, 0x00, 0x3c, 0x06, 0xe4, 0xda, 0xc7, 0x47, 0x00, 0x00, - 0xf3, 0x39, 0x55, 0x28, 0x97, 0xba, 0xff, 0xff, 0x07, 0x79, 0x38, 0x4e, 0xe0, 0x21, 0x00, - 0x00, 0xde, 0xed, 0x1c, 0x23, 0x09, 0x49, 0x00, 0x00, 0x49, 0x46, 0x49, 0x5d, 0x8f, 0x34, - 0x00, 0x00, 0x38, 0x18, 0x50, 0xf6, 0xa1, 0x11, 0x00, 0x00, 0xdf, 0xb8, 0x19, 0x14, 0xd1, - 0xe1, 0xff, 0xff, 0x2c, 0x56, 
0x72, 0x93, 0x64, 0x3f, 0x00, 0x00, 0x1c, 0xe0, 0xbe, 0x87, - 0x7d, 0xf9, 0xff, 0xff, 0x73, 0x0e, 0x3c, 0x01, 0x91, 0xf9, 0xff, 0xff, 0xb2, 0x37, 0x85, - 0x81, 0x5f, 0x54, 0x00, 0x00, 0x58, 0x44, 0xb0, 0x1a, 0xac, 0xbb, 0xff, 0xff, 0x36, 0xbf, - 0xbe, 0x5e, 0x22, 0xff, 0xff, 0xff, 0x06, 0x20, 0xa0, 0x23, 0x0d, 0x3b, 0x00, 0x00, 0x19, - 0xc6, 0x49, 0x0a, 0x00, 0xcf, 0xff, 0xff, 0x4f, 0xcd, 0xc6, 0x95, 0x4b, 0xf1, 0xff, 0xff, - 0xa3, 0x59, 0xaf, 0x65, 0xec, 0xe9, 0xff, 0xff, 0x58, 0xef, 0x05, 0x50, 0x63, 0xe4, 0xff, - 0xff, 0xc7, 0x6a, 0x9e, 0xf1, 0x69, 0x20, 0x00, 0x00, 0xd1, 0xb3, 0xc9, 0x14, 0xb2, 0x29, - 0x00, 0x00, 0x1d, 0x48, 0x16, 0x70, 0xf0, 0x40, 0x00, 0x00, 0x01, 0xc0, 0x85, 0xef, 0x0c, - 0xa5, 0xff, 0xff, 0xff, 0x3f, 0x7a, 0x10, 0xf3, 0x5a, 0x00, 0x00, 0x15, 0x00, 0x15, 0x90, - 0x0d, 0x15, 0x9a, 0x0d, 0x2c, 0x15, 0x6a, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1c, 0x36, - 0x02, 0x28, 0x10, 0x4b, 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, - 0xff, 0xff, 0xff, 0xff, 0x18, 0x10, 0xb4, 0xc4, 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, - 0x75, 0xdd, 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xc8, 0x06, 0xf4, 0x47, 0x03, - 0x04, 0x00, 0x00, 0x00, 0x64, 0x01, 0x03, 0x06, 0x05, 0x49, 0xf7, 0xfc, 0x89, 0x3d, 0x3e, - 0x20, 0x07, 0x72, 0x3e, 0xa1, 0x66, 0x81, 0x67, 0x80, 0x23, 0x78, 0x06, 0x68, 0x0e, 0x78, - 0xf5, 0x08, 0xed, 0x20, 0xcd, 0x0e, 0x7f, 0x9c, 0x70, 0xa0, 0xb9, 0x16, 0x44, 0xb2, 0x41, - 0x62, 0xba, 0x82, 0xad, 0xe1, 0x12, 0x9b, 0xa6, 0x53, 0x8d, 0x20, 0x27, 0xd5, 0x84, 0x63, - 0xb8, 0x07, 0x4b, 0x5b, 0xa4, 0x1c, 0xa4, 0x1c, 0x17, 0xbf, 0x4b, 0x00, 0x24, 0x04, 0x56, - 0xa8, 0x52, 0xaf, 0x33, 0xf7, 0xad, 0x7c, 0xc8, 0x83, 0x25, 0x13, 0xaf, 0x80, 0x25, 0x6f, - 0xbd, 0xd1, 0x15, 0x69, 0x64, 0x20, 0x7b, 0xd7, 0x33, 0xba, 0x66, 0x29, 0x8a, 0x00, 0xda, - 0x42, 0x07, 0x2c, 0x6c, 0x39, 0x76, 0x9f, 0xdc, 0x17, 0xad, 0xb6, 0x58, 0xdf, 0x5f, 0x00, - 0x18, 0x3a, 0xae, 0x1c, 0xd6, 0x5f, 0x9d, 0x78, 0x8d, 0x73, 0xdd, 0x3e, 0xd6, 0x18, 0x33, - 0x40, 
0xe4, 0x36, 0xde, 0xb0, 0xb7, 0x33, 0x2a, 0x6b, 0x08, 0x03, 0x6c, 0x6d, 0x8f, 0x13, - 0x93, 0xd0, 0xd7, 0x87, 0x62, 0x63, 0x53, 0xfb, 0xd8, 0xbb, 0xc9, 0x54, 0x90, 0xd6, 0xa9, - 0x8f, 0xc8, 0x60, 0xbd, 0xec, 0x75, 0x23, 0x9a, 0x21, 0xec, 0xe4, 0x86, 0x43, 0xd7, 0xc1, - 0x88, 0xdc, 0x82, 0x00, 0x32, 0x79, 0xc9, 0x2b, 0x70, 0x85, 0xb7, 0x25, 0xa1, 0xcc, 0x7d, - 0x0b, 0x29, 0x03, 0xea, 0x80, 0xff, 0x9b, 0xf3, 0x24, 0x7f, 0xd1, 0xff, 0xf0, 0x22, 0x65, - 0x85, 0x99, 0x17, 0x63, 0xc2, 0xc0, 0xb7, 0x62, 0x05, 0xda, 0x7a, 0xa0, 0xc3, 0x2a, 0x6f, - 0x1f, 0xee, 0x1f, 0x31, 0xa8, 0x42, 0x80, 0xe4, 0xb7, 0x6c, 0xf6, 0xac, 0x47, 0xb0, 0x17, - 0x69, 0xcb, 0xff, 0x66, 0x8a, 0xd6, 0x25, 0x00, 0xf3, 0xcf, 0x0a, 0xaf, 0xf8, 0x92, 0x8a, - 0xa0, 0xdf, 0x71, 0x13, 0x8d, 0x9d, 0xff, 0x7e, 0xe0, 0x0a, 0x52, 0xf1, 0x97, 0x01, 0xa9, - 0x73, 0x27, 0xfd, 0x63, 0x58, 0x00, 0x32, 0xa6, 0xf6, 0x78, 0xb8, 0xe4, 0xfd, 0x20, 0x7c, - 0x90, 0xee, 0xad, 0x8c, 0xc9, 0x71, 0x35, 0x66, 0x71, 0x3c, 0xe0, 0xe4, 0x0b, 0xbb, 0xa0, - 0x50, 0xe9, 0xf2, 0x81, 0x1d, 0x3a, 0x95, 0x94, 0x00, 0xd5, 0x49, 0x00, 0x07, 0xdf, 0x21, - 0x53, 0x36, 0x8d, 0x9e, 0xd9, 0xa5, 0x52, 0x4d, 0x0d, 0x29, 0x74, 0xf0, 0x40, 0xbd, 0xda, - 0x63, 0x4e, 0xdd, 0x91, 0x8e, 0xa6, 0xa7, 0xf6, 0x78, 0x58, 0x3b, 0x0a, 0x5c, 0x60, 0x3c, - 0x15, 0x34, 0xf8, 0x2c, 0x21, 0xe3, 0x56, 0x1b, 0x9e, 0xd9, 0x56, 0xd3, 0x13, 0x2e, 0x80, - 0x2c, 0x36, 0xda, 0x1d, 0xc8, 0xfb, 0x52, 0xee, 0x17, 0xb3, 0x2b, 0xf3, 0xd2, 0xeb, 0x29, - 0xa0, 0x37, 0xa0, 0x12, 0xce, 0x1c, 0x50, 0x6a, 0xf4, 0x11, 0xcd, 0x96, 0x88, 0x3f, 0x43, - 0x78, 0xc0, 0x2c, 0x53, 0x6c, 0xa6, 0xdf, 0xb9, 0x9e, 0x93, 0xd4, 0x1e, 0xa9, 0x7f, 0x67, - 0xa6, 0xc1, 0x80, 0x46, 0x0f, 0x63, 0x7d, 0x15, 0xf2, 0x4c, 0xc5, 0xda, 0x11, 0x9a, 0x20, - 0x67, 0x27, 0xe8, 0x00, 0xec, 0x03, 0x1d, 0x15, 0xa7, 0x92, 0xb3, 0x1f, 0xda, 0x20, 0x92, - 0xd8, 0x00, 0xfb, 0x06, 0x80, 0xeb, 0x4b, 0x0c, 0xc1, 0x1f, 0x49, 0x40, 0x06, 0x8d, 0x8a, - 0xf8, 0x34, 0xb1, 0x0c, 0x1d, 0x20, 0xd0, 0x47, 0xe5, 0xb1, 0x7e, 0xf7, 
0xe4, 0xb4, 0x7e, - 0x9c, 0x84, 0x18, 0x61, 0x32, 0x4f, 0xc0, 0xc2, 0xb2, 0xcc, 0x63, 0xf6, 0xe1, 0x16, 0xd6, - 0xd9, 0x4b, 0x74, 0x13, 0x01, 0xa1, 0xe2, 0x00, 0xb7, 0x9e, 0xc1, 0x3a, 0xc5, 0xaf, 0xe8, - 0x54, 0x07, 0x2a, 0x20, 0xfd, 0x2c, 0x6f, 0xb9, 0x80, 0x18, 0x92, 0x87, 0xa0, 0x81, 0x24, - 0x60, 0x47, 0x17, 0x4f, 0xbc, 0xbe, 0xf5, 0x03, 0x69, 0x80, 0xe3, 0x10, 0x54, 0xd6, 0x68, - 0x7d, 0x75, 0xd3, 0x0a, 0x45, 0x38, 0x9e, 0xa9, 0xfd, 0x05, 0x40, 0xd2, 0x1e, 0x6f, 0x5c, - 0x30, 0x10, 0xfe, 0x9b, 0x9f, 0x6d, 0xc0, 0x9d, 0x6c, 0x17, 0x7d, 0x00, 0x09, 0xb6, 0x8a, - 0x31, 0x8e, 0x1b, 0x6b, 0x84, 0x1e, 0x79, 0xce, 0x10, 0x55, 0x59, 0x6a, 0x40, 0x16, 0xdc, - 0x9a, 0xcf, 0x4d, 0xb0, 0x8f, 0xac, 0xe3, 0x8d, 0xee, 0xd2, 0xef, 0x01, 0x8c, 0xe0, 0x2b, - 0x24, 0xe5, 0xb4, 0xe1, 0x86, 0x72, 0x00, 0x30, 0x07, 0xce, 0x02, 0x23, 0x41, 0x33, 0x40, - 0xf0, 0x9b, 0xc2, 0x2d, 0x30, 0xec, 0x3b, 0x17, 0xb2, 0x8f, 0x64, 0x7d, 0xcd, 0x70, 0x9e, - 0x80, 0x22, 0xb5, 0xdf, 0x6d, 0x2a, 0x43, 0xd4, 0x2b, 0x5a, 0xf6, 0x96, 0xa6, 0xea, 0x91, - 0x62, 0x80, 0x39, 0xf2, 0x5a, 0x8e, 0xc0, 0xb9, 0x29, 0x99, 0x17, 0xe7, 0x35, 0x2c, 0xf6, - 0x4d, 0x18, 0x00, 0x48, 0x10, 0x85, 0xb4, 0x3f, 0x89, 0x60, 0x49, 0x6e, 0xf0, 0xcd, 0x9d, - 0x92, 0xeb, 0x96, 0x80, 0xcf, 0xf9, 0xf1, 0x46, 0x1d, 0xc0, 0x49, 0xb3, 0x36, 0x2e, 0x24, - 0xc8, 0xdb, 0x41, 0x72, 0x20, 0xf5, 0xde, 0x5c, 0xf9, 0x4a, 0x6e, 0xa0, 0x0b, 0x13, 0xfc, - 0x2d, 0x17, 0x07, 0x16, 0x5e, 0x00, 0x3c, 0x54, 0x41, 0x0e, 0xa2, 0x0d, 0xf3, 0x48, 0x12, - 0x2e, 0x7c, 0xab, 0x3c, 0x59, 0x1c, 0x40, 0xca, 0xb0, 0x71, 0xc7, 0x29, 0xf0, 0xbb, 0x9f, - 0xf4, 0x3f, 0x25, 0x49, 0xad, 0xc2, 0x8f, 0x80, 0x04, 0x38, 0x6d, 0x35, 0x02, 0xca, 0xe6, - 0x02, 0x83, 0x89, 0x4e, 0x74, 0xdb, 0x08, 0x5a, 0x80, 0x13, 0x99, 0xd4, 0x26, 0xc1, 0x27, - 0xce, 0xb0, 0x98, 0x99, 0xca, 0xf6, 0x3e, 0x50, 0x49, 0xd0, 0xbf, 0xcb, 0x6f, 0xbe, 0x5b, - 0x92, 0x63, 0xde, 0x94, 0xd3, 0x8f, 0x07, 0x06, 0x0f, 0x2b, 0x80, 0x36, 0xf1, 0x77, 0xf6, - 0x29, 0x33, 0x13, 0xa9, 0x4a, 0x55, 0x3d, 0x6c, 
0xca, 0xdb, 0x4e, 0x40, 0xc4, 0x95, 0x54, - 0xf4, 0xe2, 0x8c, 0x1b, 0xa0, 0xfe, 0x30, 0x50, 0x9d, 0x62, 0xbc, 0x5c, 0x00, 0xb4, 0xc4, - 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, 0x75, 0xdd, 0xc0, 0x00, 0x00, 0x00, 0x01, 0x4b, - 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, 0xff, 0xff, 0xff, 0xff, - 0x15, 0x02, 0x19, 0x4c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x15, 0x06, 0x00, 0x15, 0x02, 0x25, 0x02, 0x18, 0x06, 0x64, 0x65, 0x63, - 0x37, 0x70, 0x34, 0x25, 0x0a, 0x15, 0x08, 0x15, 0x0e, 0x00, 0x15, 0x04, 0x25, 0x02, 0x18, - 0x07, 0x64, 0x65, 0x63, 0x31, 0x34, 0x70, 0x35, 0x25, 0x0a, 0x15, 0x0a, 0x15, 0x1c, 0x00, - 0x15, 0x0e, 0x15, 0x20, 0x15, 0x02, 0x18, 0x08, 0x64, 0x65, 0x63, 0x33, 0x38, 0x70, 0x31, - 0x38, 0x25, 0x0a, 0x15, 0x24, 0x15, 0x4c, 0x00, 0x16, 0x6a, 0x19, 0x1c, 0x19, 0x3c, 0x26, - 0x08, 0x1c, 0x15, 0x02, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x06, 0x64, 0x65, 0x63, - 0x37, 0x70, 0x34, 0x15, 0x02, 0x16, 0x6a, 0x16, 0xf6, 0x03, 0x16, 0xfe, 0x03, 0x26, 0x08, - 0x3c, 0x36, 0x02, 0x28, 0x04, 0x7f, 0x96, 0x98, 0x00, 0x18, 0x04, 0x81, 0x69, 0x67, 0xff, - 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0x86, 0x04, - 0x1c, 0x15, 0x04, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x07, 0x64, 0x65, 0x63, 0x31, - 0x34, 0x70, 0x35, 0x15, 0x02, 0x16, 0x6a, 0x16, 0xa6, 0x07, 0x16, 0xb0, 0x07, 0x26, 0x86, - 0x04, 0x3c, 0x36, 0x02, 0x28, 0x08, 0xff, 0x3f, 0x7a, 0x10, 0xf3, 0x5a, 0x00, 0x00, 0x18, - 0x08, 0x01, 0xc0, 0x85, 0xef, 0x0c, 0xa5, 0xff, 0xff, 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, - 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0xb6, 0x0b, 0x1c, 0x15, 0x0e, 0x19, 0x35, 0x06, - 0x08, 0x00, 0x19, 0x18, 0x08, 0x64, 0x65, 0x63, 0x33, 0x38, 0x70, 0x31, 0x38, 0x15, 0x02, - 0x16, 0x6a, 0x16, 0x86, 0x0e, 0x16, 0x90, 0x0e, 0x26, 0xb6, 0x0b, 0x3c, 0x36, 0x02, 0x28, - 0x10, 0x4b, 0x3b, 0x4c, 0xa8, 0x5a, 0x86, 0xc4, 0x7a, 0x09, 0x8a, 0x22, 0x3f, 0xff, 0xff, - 0xff, 0xff, 0x18, 0x10, 
0xb4, 0xc4, 0xb3, 0x57, 0xa5, 0x79, 0x3b, 0x85, 0xf6, 0x75, 0xdd, - 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x19, 0x1c, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, - 0x00, 0x00, 0x16, 0xa2, 0x19, 0x16, 0x6a, 0x00, 0x19, 0x2c, 0x18, 0x18, 0x6f, 0x72, 0x67, - 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x2e, 0x76, - 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x05, 0x33, 0x2e, 0x30, 0x2e, 0x31, 0x00, 0x18, - 0x29, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x73, 0x70, 0x61, - 0x72, 0x6b, 0x2e, 0x73, 0x71, 0x6c, 0x2e, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2e, - 0x72, 0x6f, 0x77, 0x2e, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0xf4, 0x01, - 0x7b, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x22, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, - 0x22, 0x2c, 0x22, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x22, 0x3a, 0x5b, 0x7b, 0x22, 0x6e, - 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x37, 0x70, 0x34, 0x22, 0x2c, 0x22, - 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, - 0x37, 0x2c, 0x34, 0x29, 0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, - 0x22, 0x3a, 0x74, 0x72, 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x22, 0x3a, 0x7b, 0x7d, 0x7d, 0x2c, 0x7b, 0x22, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, - 0x22, 0x64, 0x65, 0x63, 0x31, 0x34, 0x70, 0x35, 0x22, 0x2c, 0x22, 0x74, 0x79, 0x70, 0x65, - 0x22, 0x3a, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, 0x31, 0x34, 0x2c, 0x35, - 0x29, 0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x22, 0x3a, 0x74, - 0x72, 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3a, - 0x7b, 0x7d, 0x7d, 0x2c, 0x7b, 0x22, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, 0x64, 0x65, - 0x63, 0x33, 0x38, 0x70, 0x31, 0x38, 0x22, 0x2c, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3a, - 0x22, 0x64, 0x65, 0x63, 0x69, 0x6d, 0x61, 0x6c, 0x28, 0x33, 0x38, 0x2c, 0x31, 0x38, 0x29, - 
0x22, 0x2c, 0x22, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x22, 0x3a, 0x74, 0x72, - 0x75, 0x65, 0x2c, 0x22, 0x6d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3a, 0x7b, - 0x7d, 0x7d, 0x5d, 0x7d, 0x00, 0x18, 0x4a, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, - 0x6d, 0x72, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x31, 0x2e, 0x31, 0x30, - 0x2e, 0x31, 0x20, 0x28, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x20, 0x61, 0x38, 0x39, 0x64, 0x66, - 0x38, 0x66, 0x39, 0x39, 0x33, 0x32, 0x62, 0x36, 0x65, 0x66, 0x36, 0x36, 0x33, 0x33, 0x64, - 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, - 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3c, 0x1c, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x1c, - 0x00, 0x00, 0x00, 0xd3, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - unsigned int decimals_parquet_len = 2366; - - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(decimals_parquet), decimals_parquet_len}); - auto result = cudf::io::read_parquet(read_opts); - - auto validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 50; }); - - EXPECT_EQ(result.tbl->view().num_columns(), 3); - - int32_t col0_data[] = { - -2354584, -190275, 8393572, 6446515, -5687920, -1843550, -6897687, -6780385, 3428529, - 5842056, -4312278, -4450603, -7516141, 2974667, -4288640, 1065090, -9410428, 7891355, - 1076244, -1975984, 6999466, 2666959, 9262967, 7931374, -1370640, 451074, 8799111, - 3026424, -6803730, 5098297, -1414370, -9662848, 2499991, 658765, 8348874, -6177036, - -9694494, -5343299, 3558393, -8789072, 2697890, -4454707, 8299309, -6223703, -3112513, - 7537487, 825776, -495683, 328299, -4529727, 0, -9999999, 9999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); - cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity, 
numeric::scale_type{-4}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - - int64_t col1_data[] = {29274040266581, -17210335917753, -58420730139037, - 68073792696254, 2236456014294, 13704555677045, - -70797090469548, -52248605513407, -68976081919961, - -34277313883112, 97774730521689, 21184241014572, - -670882460254, -40862944054399, -24079852370612, - -88670167797498, -84007574359403, -71843004533519, - -55538016554201, 3491435293032, -29085437167297, - 36901882672273, -98622066122568, -13974902998457, - 86712597643378, -16835133643735, -94759096142232, - 30708340810940, 79086853262082, 78923696440892, - -76316597208589, 37247268714759, 80303592631774, - 57790350050889, 19387319851064, -33186875066145, - 69701203023404, -7157433049060, -7073790423437, - 92769171617714, -75127120182184, -951893180618, - 64927618310150, -53875897154023, -16168039035569, - -24273449166429, -30359781249192, 35639397345991, - 45844829680593, 71401416837149, 0, - -99999999999999, 99999999999999}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); - cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity, numeric::scale_type{-5}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - - cudf::io::parquet_reader_options read_strict_opts = read_opts; - read_strict_opts.set_columns({"dec7p4", "dec14p5"}); - EXPECT_NO_THROW(cudf::io::read_parquet(read_strict_opts)); - } - { - // dec7p3: Decimal(precision=7, scale=3) backed by FIXED_LENGTH_BYTE_ARRAY(length = 4) - // dec12p11: Decimal(precision=12, scale=11) backed by FIXED_LENGTH_BYTE_ARRAY(length = 6) - // dec20p1: Decimal(precision=20, scale=1) backed by FIXED_LENGTH_BYTE_ARRAY(length = 9) - unsigned char const fixed_len_bytes_decimal_parquet[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0xA8, 0x01, 0x15, 0xAE, 0x01, 0x2C, 0x15, 0x28, - 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 
0x04, 0x00, 0x97, 0x45, 0x72, - 0x18, 0x04, 0x00, 0x01, 0x81, 0x3B, 0x00, 0x00, 0x00, 0x54, 0xF0, 0x53, 0x04, 0x00, 0x00, - 0x00, 0x26, 0x01, 0x03, 0x00, 0x00, 0x61, 0x10, 0xCF, 0x00, 0x0A, 0xA9, 0x08, 0x00, 0x77, - 0x58, 0x6F, 0x00, 0x6B, 0xEE, 0xA4, 0x00, 0x92, 0xF8, 0x94, 0x00, 0x2E, 0x18, 0xD4, 0x00, - 0x4F, 0x45, 0x33, 0x00, 0x97, 0x45, 0x72, 0x00, 0x0D, 0xC2, 0x75, 0x00, 0x76, 0xAA, 0xAA, - 0x00, 0x30, 0x9F, 0x86, 0x00, 0x4B, 0x9D, 0xB1, 0x00, 0x4E, 0x4B, 0x3B, 0x00, 0x01, 0x81, - 0x3B, 0x00, 0x22, 0xD4, 0x53, 0x00, 0x72, 0xC4, 0xAF, 0x00, 0x43, 0x9B, 0x72, 0x00, 0x1D, - 0x91, 0xC3, 0x00, 0x45, 0x27, 0x48, 0x15, 0x00, 0x15, 0xF4, 0x01, 0x15, 0xFA, 0x01, 0x2C, - 0x15, 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x02, 0x28, 0x06, 0x00, 0xD5, - 0xD7, 0x31, 0x99, 0xA6, 0x18, 0x06, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x00, 0x00, - 0x7A, 0xF0, 0x79, 0x04, 0x00, 0x00, 0x00, 0x24, 0x01, 0x03, 0x02, 0x00, 0x54, 0x23, 0xCF, - 0x13, 0x0A, 0x00, 0x07, 0x22, 0xB1, 0x21, 0x7E, 0x00, 0x64, 0x19, 0xD6, 0xD2, 0xA5, 0x00, - 0x61, 0x7F, 0xF6, 0xB9, 0xB0, 0x00, 0xD0, 0x7F, 0x9C, 0xA9, 0xE9, 0x00, 0x65, 0x58, 0xF0, - 0xAD, 0xFB, 0x00, 0xBC, 0x61, 0xE2, 0x03, 0xDA, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, - 0x63, 0x4B, 0x4C, 0xFE, 0x45, 0x00, 0x7A, 0xA0, 0xD8, 0xD1, 0xC0, 0x00, 0xC0, 0x63, 0xF7, - 0x9D, 0x0A, 0x00, 0x88, 0x22, 0x0F, 0x1B, 0x25, 0x00, 0x1A, 0x80, 0x56, 0x34, 0xC7, 0x00, - 0x5F, 0x48, 0x61, 0x09, 0x7C, 0x00, 0x61, 0xEF, 0x92, 0x42, 0x2F, 0x00, 0xD5, 0xD7, 0x31, - 0x99, 0xA6, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x71, 0xDD, 0xE2, 0x22, 0x7B, 0x00, - 0x54, 0xBF, 0xAE, 0xE9, 0x3C, 0x15, 0x00, 0x15, 0xD4, 0x02, 0x15, 0xDC, 0x02, 0x2C, 0x15, - 0x28, 0x15, 0x00, 0x15, 0x06, 0x15, 0x08, 0x1C, 0x36, 0x04, 0x28, 0x09, 0x00, 0x7D, 0xFE, - 0x02, 0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x18, 0x09, 0x00, 0x03, 0x9C, 0xCD, 0x5A, 0xAC, 0xBB, - 0xF1, 0xE3, 0x00, 0x00, 0x00, 0xAA, 0x01, 0xF0, 0xA9, 0x04, 0x00, 0x00, 0x00, 0x07, 0xBF, - 0xBF, 0x0F, 0x00, 0x7D, 0xFE, 0x02, 
0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x00, 0x7D, 0x9A, 0xCB, - 0xDA, 0x4B, 0x10, 0x8B, 0xAC, 0x00, 0x20, 0xBA, 0x97, 0x87, 0x2E, 0x3B, 0x4E, 0x04, 0x00, - 0x15, 0xBB, 0xC2, 0xDF, 0x2D, 0x25, 0x08, 0xB6, 0x00, 0x5C, 0x67, 0x0E, 0x36, 0x30, 0xF1, - 0xAC, 0xA4, 0x00, 0x44, 0xF1, 0x8E, 0xFB, 0x17, 0x5E, 0xE1, 0x96, 0x00, 0x64, 0x69, 0xF9, - 0x66, 0x3F, 0x11, 0xED, 0xB9, 0x00, 0x45, 0xB5, 0xDA, 0x14, 0x9C, 0xA3, 0xFA, 0x64, 0x00, - 0x26, 0x5F, 0xDE, 0xD7, 0x67, 0x95, 0xEF, 0xB1, 0x00, 0x35, 0xDB, 0x9B, 0x88, 0x46, 0xD0, - 0xA1, 0x0E, 0x00, 0x45, 0xA9, 0x92, 0x8E, 0x89, 0xD1, 0xAC, 0x4C, 0x00, 0x4C, 0xF1, 0xCB, - 0x27, 0x82, 0x3A, 0x7D, 0xB7, 0x00, 0x64, 0xD3, 0xD2, 0x2F, 0x9C, 0x83, 0x16, 0x75, 0x00, - 0x15, 0xDF, 0xC2, 0xA9, 0x63, 0xB8, 0x33, 0x65, 0x00, 0x27, 0x40, 0x28, 0x97, 0x05, 0x8E, - 0xE3, 0x46, 0x00, 0x03, 0x9C, 0xCD, 0x5A, 0xAC, 0xBB, 0xF1, 0xE3, 0x00, 0x22, 0x23, 0xF5, - 0xE8, 0x9D, 0x55, 0xD4, 0x9C, 0x00, 0x25, 0xB9, 0xD8, 0x87, 0x2D, 0xF1, 0xF2, 0x17, 0x15, - 0x02, 0x19, 0x4C, 0x48, 0x0C, 0x73, 0x70, 0x61, 0x72, 0x6B, 0x5F, 0x73, 0x63, 0x68, 0x65, - 0x6D, 0x61, 0x15, 0x06, 0x00, 0x15, 0x0E, 0x15, 0x08, 0x15, 0x02, 0x18, 0x06, 0x64, 0x65, - 0x63, 0x37, 0x70, 0x33, 0x25, 0x0A, 0x15, 0x06, 0x15, 0x0E, 0x00, 0x15, 0x0E, 0x15, 0x0C, - 0x15, 0x02, 0x18, 0x08, 0x64, 0x65, 0x63, 0x31, 0x32, 0x70, 0x31, 0x31, 0x25, 0x0A, 0x15, - 0x16, 0x15, 0x18, 0x00, 0x15, 0x0E, 0x15, 0x12, 0x15, 0x02, 0x18, 0x07, 0x64, 0x65, 0x63, - 0x32, 0x30, 0x70, 0x31, 0x25, 0x0A, 0x15, 0x02, 0x15, 0x28, 0x00, 0x16, 0x28, 0x19, 0x1C, - 0x19, 0x3C, 0x26, 0x08, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x06, - 0x64, 0x65, 0x63, 0x37, 0x70, 0x33, 0x15, 0x02, 0x16, 0x28, 0x16, 0xEE, 0x01, 0x16, 0xF4, - 0x01, 0x26, 0x08, 0x3C, 0x36, 0x02, 0x28, 0x04, 0x00, 0x97, 0x45, 0x72, 0x18, 0x04, 0x00, - 0x01, 0x81, 0x3B, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, - 0x26, 0xFC, 0x01, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, 0x08, 0x00, 0x19, 0x18, 0x08, 0x64, - 0x65, 0x63, 
0x31, 0x32, 0x70, 0x31, 0x31, 0x15, 0x02, 0x16, 0x28, 0x16, 0xC2, 0x02, 0x16, - 0xC8, 0x02, 0x26, 0xFC, 0x01, 0x3C, 0x36, 0x02, 0x28, 0x06, 0x00, 0xD5, 0xD7, 0x31, 0x99, - 0xA6, 0x18, 0x06, 0xFF, 0x17, 0x2B, 0x5A, 0xF0, 0x01, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, - 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x26, 0xC4, 0x04, 0x1C, 0x15, 0x0E, 0x19, 0x35, 0x06, - 0x08, 0x00, 0x19, 0x18, 0x07, 0x64, 0x65, 0x63, 0x32, 0x30, 0x70, 0x31, 0x15, 0x02, 0x16, - 0x28, 0x16, 0xAE, 0x03, 0x16, 0xB6, 0x03, 0x26, 0xC4, 0x04, 0x3C, 0x36, 0x04, 0x28, 0x09, - 0x00, 0x7D, 0xFE, 0x02, 0xDA, 0xB2, 0x62, 0xA3, 0xFB, 0x18, 0x09, 0x00, 0x03, 0x9C, 0xCD, - 0x5A, 0xAC, 0xBB, 0xF1, 0xE3, 0x00, 0x19, 0x1C, 0x15, 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, - 0x00, 0x00, 0x16, 0xDE, 0x07, 0x16, 0x28, 0x00, 0x19, 0x2C, 0x18, 0x18, 0x6F, 0x72, 0x67, - 0x2E, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2E, 0x73, 0x70, 0x61, 0x72, 0x6B, 0x2E, 0x76, - 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x18, 0x05, 0x33, 0x2E, 0x30, 0x2E, 0x31, 0x00, 0x18, - 0x29, 0x6F, 0x72, 0x67, 0x2E, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2E, 0x73, 0x70, 0x61, - 0x72, 0x6B, 0x2E, 0x73, 0x71, 0x6C, 0x2E, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2E, - 0x72, 0x6F, 0x77, 0x2E, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x18, 0xF4, 0x01, - 0x7B, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x22, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, - 0x22, 0x2C, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x73, 0x22, 0x3A, 0x5B, 0x7B, 0x22, 0x6E, - 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x37, 0x70, 0x33, 0x22, 0x2C, 0x22, - 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, - 0x37, 0x2C, 0x33, 0x29, 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, - 0x22, 0x3A, 0x74, 0x72, 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x22, 0x3A, 0x7B, 0x7D, 0x7D, 0x2C, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, - 0x22, 0x64, 0x65, 0x63, 0x31, 0x32, 0x70, 0x31, 0x31, 0x22, 0x2C, 0x22, 0x74, 
0x79, 0x70, - 0x65, 0x22, 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, 0x31, 0x32, 0x2C, - 0x31, 0x31, 0x29, 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, 0x22, - 0x3A, 0x74, 0x72, 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, - 0x22, 0x3A, 0x7B, 0x7D, 0x7D, 0x2C, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x22, - 0x64, 0x65, 0x63, 0x32, 0x30, 0x70, 0x31, 0x22, 0x2C, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, - 0x3A, 0x22, 0x64, 0x65, 0x63, 0x69, 0x6D, 0x61, 0x6C, 0x28, 0x32, 0x30, 0x2C, 0x31, 0x29, - 0x22, 0x2C, 0x22, 0x6E, 0x75, 0x6C, 0x6C, 0x61, 0x62, 0x6C, 0x65, 0x22, 0x3A, 0x74, 0x72, - 0x75, 0x65, 0x2C, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x7B, - 0x7D, 0x7D, 0x5D, 0x7D, 0x00, 0x18, 0x4A, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2D, - 0x6D, 0x72, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x20, 0x31, 0x2E, 0x31, 0x30, - 0x2E, 0x31, 0x20, 0x28, 0x62, 0x75, 0x69, 0x6C, 0x64, 0x20, 0x61, 0x38, 0x39, 0x64, 0x66, - 0x38, 0x66, 0x39, 0x39, 0x33, 0x32, 0x62, 0x36, 0x65, 0x66, 0x36, 0x36, 0x33, 0x33, 0x64, - 0x30, 0x36, 0x30, 0x36, 0x39, 0x65, 0x35, 0x30, 0x63, 0x39, 0x62, 0x37, 0x39, 0x37, 0x30, - 0x62, 0x65, 0x62, 0x64, 0x31, 0x29, 0x19, 0x3C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, - 0x00, 0x00, 0x00, 0xC5, 0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31, - }; - - unsigned int parquet_len = 1226; - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{ - reinterpret_cast(fixed_len_bytes_decimal_parquet), parquet_len}); - auto result = cudf::io::read_parquet(read_opts); - EXPECT_EQ(result.tbl->view().num_columns(), 3); - - auto validity_c0 = cudf::test::iterators::nulls_at({19}); - int32_t col0_data[] = {6361295, 698632, 7821423, 7073444, 9631892, 3021012, 5195059, - 9913714, 901749, 7776938, 3186566, 4955569, 5131067, 98619, - 2282579, 7521455, 4430706, 1937859, 4532040, 0}; - - 
EXPECT_EQ(static_cast(result.tbl->view().column(0).size()), - sizeof(col0_data) / sizeof(col0_data[0])); - cudf::test::fixed_point_column_wrapper col0( - std::begin(col0_data), std::end(col0_data), validity_c0, numeric::scale_type{-3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), col0); - - auto validity_c1 = cudf::test::iterators::nulls_at({18}); - int64_t col1_data[] = {361378026250, - 30646804862, - 429930238629, - 418758703536, - 895494171113, - 435283865083, - 809096053722, - -999999999999, - 426465099333, - 526684574144, - 826310892810, - 584686967589, - 113822282951, - 409236212092, - 420631167535, - 918438386086, - -999999999999, - 489053889147, - 0, - 363993164092}; - - EXPECT_EQ(static_cast(result.tbl->view().column(1).size()), - sizeof(col1_data) / sizeof(col1_data[0])); - cudf::test::fixed_point_column_wrapper col1( - std::begin(col1_data), std::end(col1_data), validity_c1, numeric::scale_type{-11}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(1), col1); - - auto validity_c2 = cudf::test::iterators::nulls_at({6, 14}); - __int128_t col2_data[] = {9078697037144433659, - 9050770539577117612, - 2358363961733893636, - 1566059559232276662, - 6658306200002735268, - 4967909073046397334, - 0, - 7235588493887532473, - 5023160741463849572, - 2765173712965988273, - 3880866513515749646, - 5019704400576359500, - 5544435986818825655, - 7265381725809874549, - 0, - 1576192427381240677, - 2828305195087094598, - 260308667809395171, - 2460080200895288476, - 2718441925197820439}; - - EXPECT_EQ(static_cast(result.tbl->view().column(2).size()), - sizeof(col2_data) / sizeof(col2_data[0])); - cudf::test::fixed_point_column_wrapper<__int128_t> col2( - std::begin(col2_data), std::end(col2_data), validity_c2, numeric::scale_type{-1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(2), col2); - } -} - -TEST_F(ParquetReaderTest, EmptyOutput) -{ - cudf::test::fixed_width_column_wrapper c0; - cudf::test::strings_column_wrapper c1; - 
cudf::test::fixed_point_column_wrapper c2({}, numeric::scale_type{2}); - cudf::test::lists_column_wrapper _c3{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; - auto c3 = cudf::empty_like(_c3); - - cudf::test::fixed_width_column_wrapper sc0; - cudf::test::strings_column_wrapper sc1; - cudf::test::lists_column_wrapper _sc2{{1, 2}}; - std::vector> struct_children; - struct_children.push_back(sc0.release()); - struct_children.push_back(sc1.release()); - struct_children.push_back(cudf::empty_like(_sc2)); - cudf::test::structs_column_wrapper c4(std::move(struct_children)); - - table_view expected({c0, c1, c2, *c3, c4}); - - // set precision on the decimal column - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[2].set_decimal_precision(1); - - auto filepath = temp_env->get_temp_filepath("EmptyOutput.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - out_args.set_metadata(std::move(expected_metadata)); - cudf::io::write_parquet(out_args); - - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_args); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(ParquetWriterTest, RowGroupSizeInvalid) -{ - auto const unused_table = std::make_unique(); - std::vector out_buffer; - - EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .row_group_size_rows(0), - cudf::logic_error); - EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .max_page_size_rows(0), - cudf::logic_error); - EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .row_group_size_bytes(3 << 8), - cudf::logic_error); - 
EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .max_page_size_bytes(3 << 8), - cudf::logic_error); - EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .max_page_size_bytes(0xFFFF'FFFFUL), - cudf::logic_error); - - EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .row_group_size_rows(0), - cudf::logic_error); - EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .max_page_size_rows(0), - cudf::logic_error); - EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .row_group_size_bytes(3 << 8), - cudf::logic_error); - EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .max_page_size_bytes(3 << 8), - cudf::logic_error); - EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .max_page_size_bytes(0xFFFF'FFFFUL), - cudf::logic_error); -} - -TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) -{ - auto const unused_table = std::make_unique
(); - std::vector out_buffer; - - auto options = cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), - unused_table->view()) - .row_group_size_bytes(128 * 1024) - .max_page_size_bytes(512 * 1024) - .row_group_size_rows(10000) - .max_page_size_rows(20000) - .build(); - EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); - EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); -} - -TEST_F(ParquetChunkedWriterTest, RowGroupPageSizeMatch) -{ - std::vector out_buffer; - - auto options = cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) - .row_group_size_bytes(128 * 1024) - .max_page_size_bytes(512 * 1024) - .row_group_size_rows(10000) - .max_page_size_rows(20000) - .build(); - EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); - EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); -} - -TEST_F(ParquetWriterTest, EmptyList) -{ - auto L1 = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), - cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), - 0, - {}); - auto L0 = cudf::make_lists_column( - 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); - - auto filepath = temp_env->get_temp_filepath("EmptyList.parquet"); - cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), - cudf::table_view({*L0}))); - - auto result = cudf::io::read_parquet( - cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); - - using lcw = cudf::test::lists_column_wrapper; - auto expected = lcw{lcw{}, lcw{}, lcw{}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), expected); -} - -TEST_F(ParquetWriterTest, DeepEmptyList) -{ - // Make a list column LLLi st only L is valid and LLi are all null. 
This tests whether we can - // handle multiple nullptr offsets - - auto L2 = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), - cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), - 0, - {}); - auto L1 = cudf::make_lists_column( - 0, cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), std::move(L2), 0, {}); - auto L0 = cudf::make_lists_column( - 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); - - auto filepath = temp_env->get_temp_filepath("DeepEmptyList.parquet"); - cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), - cudf::table_view({*L0}))); - - auto result = cudf::io::read_parquet( - cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), *L0); -} - -TEST_F(ParquetWriterTest, EmptyListWithStruct) -{ - auto L2 = cudf::make_lists_column(0, - cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), - cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), - 0, - {}); - - auto children = std::vector>{}; - children.push_back(std::move(L2)); - auto S2 = cudf::make_structs_column(0, std::move(children), 0, {}); - auto L1 = cudf::make_lists_column( - 0, cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), std::move(S2), 0, {}); - auto L0 = cudf::make_lists_column( - 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); - - auto filepath = temp_env->get_temp_filepath("EmptyListWithStruct.parquet"); - cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), - cudf::table_view({*L0}))); - auto result = cudf::io::read_parquet( - cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), *L0); -} - -TEST_F(ParquetWriterTest, CheckPageRows) -{ - auto sequence = 
thrust::make_counting_iterator(0); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - constexpr auto page_rows = 5000; - constexpr auto num_rows = 2 * page_rows; - column_wrapper col(sequence, sequence + num_rows, validity); - - auto expected = table_view{{col}}; - - auto const filepath = temp_env->get_temp_filepath("CheckPageRows.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .max_page_size_rows(page_rows); - cudf::io::write_parquet(out_opts); - - // check first page header and make sure it has only page_rows values - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - ASSERT_GT(fmd.row_groups.size(), 0); - ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); - auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; - ASSERT_GT(first_chunk.data_page_offset, 0); - - // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded - // version should be smaller than size of the struct. 
- auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); - - EXPECT_EQ(ph.data_page_header.num_values, page_rows); -} - -TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) -{ - // enough for a few pages with the default 20'000 rows/page - constexpr auto rows_per_page = 20'000; - constexpr auto num_rows = 3 * rows_per_page; - const std::string s1(32, 'a'); - auto col0_elements = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); - auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); - - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("CheckPageRowsAdjusted.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .max_page_size_rows(rows_per_page); - cudf::io::write_parquet(out_opts); - - // check first page header and make sure it has only page_rows values - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - ASSERT_GT(fmd.row_groups.size(), 0); - ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); - auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; - ASSERT_GT(first_chunk.data_page_offset, 0); - - // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded - // version should be smaller than size of the struct. 
- auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); - - EXPECT_LE(ph.data_page_header.num_values, rows_per_page); -} - -TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) -{ - constexpr auto rows_per_page = 1'000; - constexpr auto fragment_size = 5'000; - constexpr auto num_rows = 3 * rows_per_page; - const std::string s1(32, 'a'); - auto col0_elements = - cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); - auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); - - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("CheckPageRowsTooSmall.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .max_page_fragment_size(fragment_size) - .max_page_size_rows(rows_per_page); - cudf::io::write_parquet(out_opts); - - // check that file is written correctly when rows/page < fragment size - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - ASSERT_TRUE(fmd.row_groups.size() > 0); - ASSERT_TRUE(fmd.row_groups[0].columns.size() == 1); - auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; - ASSERT_TRUE(first_chunk.data_page_offset > 0); - - // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded - // version should be smaller than size of the struct. 
- auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); - - // there should be only one page since the fragment size is larger than rows_per_page - EXPECT_EQ(ph.data_page_header.num_values, num_rows); -} - -TEST_F(ParquetWriterTest, Decimal128Stats) -{ - // check that decimal128 min and max statistics are written in network byte order - // this is negative, so should be the min - std::vector expected_min{ - 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - std::vector expected_max{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6}; - - __int128_t val0 = 0xa1b2'c3d4'e5f6ULL; - __int128_t val1 = val0 << 80; - column_wrapper col0{{numeric::decimal128(val0, numeric::scale_type{0}), - numeric::decimal128(val1, numeric::scale_type{0})}}; - - auto expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("Decimal128Stats.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - auto const stats = get_statistics(fmd.row_groups[0].columns[0]); - - EXPECT_EQ(expected_min, stats.min_value); - EXPECT_EQ(expected_max, stats.max_value); -} - -// ============================================================================= -// ---- test data for stats sort order tests -// need at least 3 pages, and min page count is 5000, so need at least 15000 values. -// use 20000 to be safe. -static constexpr int num_ordered_rows = 20000; -static constexpr int page_size_for_ordered_tests = 5000; - -namespace { -namespace testdata { -// ----- most numerics. 
scale by 100 so all values fit in a single byte - -template -std::enable_if_t && !std::is_same_v, - cudf::test::fixed_width_column_wrapper> -ascending() -{ - int start = std::is_signed_v ? -num_ordered_rows / 2 : 0; - auto elements = - cudf::detail::make_counting_transform_iterator(start, [](auto i) { return i / 100; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t && !std::is_same_v, - cudf::test::fixed_width_column_wrapper> -descending() -{ - if (std::is_signed_v) { - auto elements = cudf::detail::make_counting_transform_iterator(-num_ordered_rows / 2, - [](auto i) { return -i / 100; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); - } else { - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return (num_ordered_rows - i) / 100; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); - } -} - -template -std::enable_if_t && !std::is_same_v, - cudf::test::fixed_width_column_wrapper> -unordered() -{ - if (std::is_signed_v) { - auto elements = cudf::detail::make_counting_transform_iterator( - -num_ordered_rows / 2, [](auto i) { return (i % 2 ? i : -i) / 100; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); - } else { - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return (i % 2 ? 
i : num_ordered_rows - i) / 100; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); - } -} - -// ----- bool - -template -std::enable_if_t, cudf::test::fixed_width_column_wrapper> ascending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i >= num_ordered_rows / 2; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t, cudf::test::fixed_width_column_wrapper> descending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i < num_ordered_rows / 2; }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t, cudf::test::fixed_width_column_wrapper> unordered() -{ - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - switch (i / page_size_for_ordered_tests) { - case 0: return true; - case 1: return false; - case 2: return true; - default: return false; - } - }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -// ----- fixed point types - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - -num_ordered_rows / 2, [](auto i) { return T(i, numeric::scale_type{0}); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - -num_ordered_rows / 2, [](auto i) { return T(-i, numeric::scale_type{0}); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 
-num_ordered_rows / 2, [](auto i) { return T(i % 2 ? i : -i, numeric::scale_type{0}); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -// ----- chrono types -// ----- timstamp - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return T(typename T::duration(i)); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return T(typename T::duration(num_ordered_rows - i)); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return T(typename T::duration(i % 2 ? i : num_ordered_rows - i)); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -// ----- duration - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> ascending() -{ - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return T(i); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> descending() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return T(num_ordered_rows - i); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t(), cudf::test::fixed_width_column_wrapper> unordered() -{ - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return T(i % 2 ? 
i : num_ordered_rows - i); }); - return cudf::test::fixed_width_column_wrapper(elements, elements + num_ordered_rows); -} - -// ----- string_view - -template -std::enable_if_t, cudf::test::strings_column_wrapper> -ascending() -{ - char buf[10]; - auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", i); - return std::string(buf); - }); - return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t, cudf::test::strings_column_wrapper> -descending() -{ - char buf[10]; - auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", num_ordered_rows - i); - return std::string(buf); - }); - return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); -} - -template -std::enable_if_t, cudf::test::strings_column_wrapper> -unordered() -{ - char buf[10]; - auto elements = cudf::detail::make_counting_transform_iterator(0, [&buf](auto i) { - sprintf(buf, "%09d", (i % 2 == 0) ? 
i : (num_ordered_rows - i)); - return std::string(buf); - }); - return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); -} - -} // namespace testdata -} // anonymous namespace - -TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) -{ - using T = TypeParam; - - auto col0 = testdata::ascending(); - auto col1 = testdata::descending(); - auto col2 = testdata::unordered(); - - auto const expected = table_view{{col0, col1, col2}}; - - auto const filepath = temp_env->get_temp_filepath("ThreeColumnSorted.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .max_page_size_rows(page_size_for_ordered_tests) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - ASSERT_GT(fmd.row_groups.size(), 0); - - auto const& columns = fmd.row_groups[0].columns; - ASSERT_EQ(columns.size(), static_cast(expected.num_columns())); - - // now check that the boundary order for chunk 1 is ascending, - // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { - cudf::io::parquet::detail::BoundaryOrder::ASCENDING, - cudf::io::parquet::detail::BoundaryOrder::DESCENDING, - cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; - - for (std::size_t i = 0; i < columns.size(); i++) { - auto const ci = read_column_index(source, columns[i]); - EXPECT_EQ(ci.boundary_order, expected_orders[i]); - } -} - -// utility functions for column index tests - -// compare two values. return -1 if v1 < v2, -// 0 if v1 == v2, and 1 if v1 > v2. -template -int32_t compare(T& v1, T& v2) -{ - return (v1 > v2) - (v1 < v2); -} - -// compare two binary statistics blobs based on their physical -// and converted types. 
returns -1 if v1 < v2, 0 if v1 == v2, and -// 1 if v1 > v2. -int32_t compare_binary(std::vector const& v1, - std::vector const& v2, - cudf::io::parquet::detail::Type ptype, - thrust::optional const& ctype) -{ - auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); - switch (ptype) { - case cudf::io::parquet::detail::INT32: - switch (ctype_val) { - case cudf::io::parquet::detail::UINT_8: - case cudf::io::parquet::detail::UINT_16: - case cudf::io::parquet::detail::UINT_32: - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - default: - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - } - - case cudf::io::parquet::detail::INT64: - if (ctype_val == cudf::io::parquet::detail::UINT_64) { - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - } - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - - case cudf::io::parquet::detail::FLOAT: - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - - case cudf::io::parquet::detail::DOUBLE: - return compare(*(reinterpret_cast(v1.data())), - *(reinterpret_cast(v2.data()))); - - case cudf::io::parquet::detail::BYTE_ARRAY: { - int32_t v1sz = v1.size(); - int32_t v2sz = v2.size(); - int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); - if (ret != 0 or v1sz == v2sz) { return ret; } - return v1sz - v2sz; - } - - default: CUDF_FAIL("Invalid type in compare_binary"); - } - - return 0; -} - -TEST_P(ParquetV2Test, LargeColumnIndex) -{ - // create a file large enough to be written in 2 batches (currently 1GB per batch) - // pick fragment size that num_rows is divisible by, so we'll get equal sized row groups - const std::string s1(1000, 'a'); - const std::string s2(1000, 'b'); - constexpr auto num_rows = 512 * 1024; - constexpr auto frag_size = num_rows / 128; - auto const is_v2 = GetParam(); - - auto col0_elements = 
cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return (i < num_rows) ? s1 : s2; }); - auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + 2 * num_rows); - - auto const expected = table_view{{col0, col0}}; - - auto const filepath = temp_env->get_temp_filepath("LargeColumnIndex.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .write_v2_headers(is_v2) - .max_page_fragment_size(frag_size) - .row_group_size_bytes(1024 * 1024 * 1024) - .row_group_size_rows(num_rows); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (auto const& rg : fmd.row_groups) { - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); - } - } -} - -TEST_P(ParquetV2Test, CheckColumnOffsetIndex) -{ - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); - - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto col5_data = random_values(num_rows); - auto col6_data = random_values(num_rows); - - auto col1 = cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end()); - auto col2 = cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end()); - auto col3 = cudf::test::fixed_width_column_wrapper(col3_data.begin(), col3_data.end()); - auto col4 = cudf::test::fixed_width_column_wrapper(col4_data.begin(), col4_data.end()); - auto col5 = cudf::test::fixed_width_column_wrapper(col5_data.begin(), col5_data.end()); - auto col6 = cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end()); - - // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); - }); - auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); - - auto const expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; - - auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndex.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - .max_page_size_rows(20000); - cudf::io::write_parquet(out_opts); - - auto const source = 
cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - - int64_t num_vals = 0; - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; - } - - // loop over page stats from the column index. check that stats.min <= page.min - // and stats.max >= page.max for each page. - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - ASSERT_TRUE(ci.null_counts.has_value()); - - // schema indexing starts at 1 - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - for (size_t p = 0; p < ci.min_values.size(); p++) { - // null_pages should always be false - EXPECT_FALSE(ci.null_pages[p]); - // null_counts should always be 0 - EXPECT_EQ(ci.null_counts.value()[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); - } - for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); - } - } -} - -TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) -{ - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); - - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - auto col3_data = random_values(num_rows); - auto col4_data = random_values(num_rows); - auto col5_data = random_values(num_rows); - auto col6_data = random_values(num_rows); - - auto valids = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - - // add null values for all but first column - auto col1 = - cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), valids); - auto col2 = - cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), valids); - auto col3 = - cudf::test::fixed_width_column_wrapper(col3_data.begin(), col3_data.end(), valids); - auto col4 = - cudf::test::fixed_width_column_wrapper(col4_data.begin(), col4_data.end(), valids); - auto col5 = - cudf::test::fixed_width_column_wrapper(col5_data.begin(), col5_data.end(), valids); - auto col6 = - cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end(), valids); - - // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); - }); - auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); - - auto expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; - - auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexNulls.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, 
expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - .max_page_size_rows(20000); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - - int64_t num_vals = 0; - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; - } - - // loop over page stats from the column index. check that stats.min <= page.min - // and stats.max >= page.max for each page. - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // should be half nulls, except no nulls in column 0 - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - ASSERT_TRUE(stats.null_count.has_value()); - EXPECT_EQ(stats.null_count.value(), c == 0 ? 
0 : num_rows / 2); - ASSERT_TRUE(ci.null_counts.has_value()); - - // schema indexing starts at 1 - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_FALSE(ci.null_pages[p]); - if (c > 0) { // first column has no nulls - EXPECT_GT(ci.null_counts.value()[p], 0); - } else { - EXPECT_EQ(ci.null_counts.value()[p], 0); - } - EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); - } - for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); - } - } - } -} - -TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) -{ - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%012d", i); - return std::string(buf); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); - - auto col1_data = random_values(num_rows); - auto col2_data = random_values(num_rows); - - // col1 is all nulls - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; }); - auto col1 = - cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), valids); - auto col2 = cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end()); - - // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - char buf[30]; - sprintf(buf, "%d", i); - return std::string(buf); - }); - auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); - - auto expected = table_view{{col0, col1, col2, col3}}; - - auto const 
filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexNullColumn.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - .max_page_size_rows(20000); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - - int64_t num_vals = 0; - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; - } - - // loop over page stats from the column index. check that stats.min <= page.min - // and stats.max >= page.max for each non-empty page. - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // there should be no nulls except column 1 which is all nulls - if (c != 1) { - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - } - ASSERT_TRUE(stats.null_count.has_value()); - EXPECT_EQ(stats.null_count.value(), c == 1 ? 
num_rows : 0); - ASSERT_TRUE(ci.null_counts.has_value()); - - // schema indexing starts at 1 - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - for (size_t p = 0; p < ci.min_values.size(); p++) { - // check tnat null_pages is true for column 1 - if (c == 1) { - EXPECT_TRUE(ci.null_pages[p]); - EXPECT_GT(ci.null_counts.value()[p], 0); - } - if (not ci.null_pages[p]) { - EXPECT_EQ(ci.null_counts.value()[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); - } - } - for (size_t p = 0; p < ci.max_values.size(); p++) { - if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); - } - } - } - } -} - -TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) -{ - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - auto c0 = testdata::ascending(); - - auto sc0 = testdata::ascending(); - auto sc1 = testdata::descending(); - auto sc2 = testdata::unordered(); - - std::vector> struct_children; - struct_children.push_back(sc0.release()); - struct_children.push_back(sc1.release()); - struct_children.push_back(sc2.release()); - cudf::test::structs_column_wrapper c1(std::move(struct_children)); - - auto listgen = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
i / 2 : num_ordered_rows - (i / 2); }); - auto list = - cudf::test::fixed_width_column_wrapper(listgen, listgen + 2 * num_ordered_rows); - auto offgen = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); - auto offsets = - cudf::test::fixed_width_column_wrapper(offgen, offgen + num_ordered_rows + 1); - - auto c2 = cudf::make_lists_column(num_ordered_rows, offsets.release(), list.release(), 0, {}); - - table_view expected({c0, c1, *c2}); - - auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexStruct.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - .max_page_size_rows(page_size_for_ordered_tests); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - // hard coded schema indices. - // TODO find a way to do this without magic - size_t const colidxs[] = {1, 3, 4, 5, 8}; - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - size_t colidx = colidxs[c]; - auto const& chunk = rg.columns[c]; - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - - int64_t num_vals = 0; - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - EXPECT_EQ(page_loc.first_row_index, num_vals); - // last column has 2 values per row - num_vals += is_v2 ? ph.data_page_header_v2.num_rows - : ph.data_page_header.num_values / (c == rg.columns.size() - 1 ? 
2 : 1); - } - - // loop over page stats from the column index. check that stats.min <= page.min - // and stats.max >= page.max for each page. - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - - auto const ptype = fmd.schema[colidx].type; - auto const ctype = fmd.schema[colidx].converted_type; - for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); - } - for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); - } - } - } -} - -TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) -{ - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - auto validity2 = - cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); - auto validity3 = cudf::detail::make_counting_transform_iterator( - 0, [](cudf::size_type i) { return (i % 3) != 0; }); - auto validity4 = cudf::detail::make_counting_transform_iterator( - 0, [](cudf::size_type i) { return (i % 4) != 0; }); - auto validity5 = cudf::detail::make_counting_transform_iterator( - 0, [](cudf::size_type i) { return (i % 5) != 0; }); - - auto c0 = testdata::ascending(); - - auto col1_data = random_values(num_ordered_rows); - auto col2_data = random_values(num_ordered_rows); - auto col3_data = random_values(num_ordered_rows); - - // col1 is all nulls - auto col1 = - cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), validity2); - auto col2 = - cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity3); - auto col3 = - cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity4); - - 
std::vector> struct_children; - struct_children.push_back(col1.release()); - struct_children.push_back(col2.release()); - struct_children.push_back(col3.release()); - auto struct_validity = std::vector(validity5, validity5 + num_ordered_rows); - cudf::test::structs_column_wrapper c1(std::move(struct_children), struct_validity); - table_view expected({c0, c1}); - - auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexStructNulls.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - .max_page_size_rows(page_size_for_ordered_tests); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - // all struct columns will have num_ordered_rows / 5 nulls at level 0. - // col1 will have num_ordered_rows / 2 nulls total - // col2 will have num_ordered_rows / 3 nulls total - // col3 will have num_ordered_rows / 4 nulls total - int const null_mods[] = {0, 2, 3, 4}; - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - auto const ci = read_column_index(source, chunk); - - // check definition level histogram (repetition will not be present) - if (c != 0) { - ASSERT_TRUE(chunk.meta_data.size_statistics.has_value()); - ASSERT_TRUE(chunk.meta_data.size_statistics->definition_level_histogram.has_value()); - // there are no lists so there should be no repetition level histogram - EXPECT_FALSE(chunk.meta_data.size_statistics->repetition_level_histogram.has_value()); 
- auto const& def_hist = chunk.meta_data.size_statistics->definition_level_histogram.value(); - ASSERT_TRUE(def_hist.size() == 3L); - auto const l0_nulls = num_ordered_rows / 5; - auto const l1_l0_nulls = num_ordered_rows / (5 * null_mods[c]); - auto const l1_nulls = num_ordered_rows / null_mods[c] - l1_l0_nulls; - auto const l2_vals = num_ordered_rows - l1_nulls - l0_nulls; - EXPECT_EQ(def_hist[0], l0_nulls); - EXPECT_EQ(def_hist[1], l1_nulls); - EXPECT_EQ(def_hist[2], l2_vals); - } else { - // column 0 has no lists and no nulls and no strings, so there should be no size stats - EXPECT_FALSE(chunk.meta_data.size_statistics.has_value()); - } - - int64_t num_vals = 0; - - if (is_v2) { ASSERT_TRUE(ci.null_counts.has_value()); } - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - EXPECT_EQ(page_loc.first_row_index, num_vals); - num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; - // check that null counts match - if (is_v2) { EXPECT_EQ(ci.null_counts.value()[o], ph.data_page_header_v2.num_nulls); } - } - } - } -} - -TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) -{ - auto const is_v2 = GetParam(); - auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 - : cudf::io::parquet::detail::PageType::DATA_PAGE; - - using cudf::test::iterators::null_at; - using cudf::test::iterators::nulls_at; - using lcw = cudf::test::lists_column_wrapper; - - // 4 nulls - // [NULL, 2, NULL] - // [] - // [4, 5] - // NULL - // def histogram [1, 1, 2, 3] - // rep histogram [4, 3] - lcw col0{{{{1, 2, 3}, nulls_at({0, 2})}, {}, {4, 5}, {}}, null_at(3)}; - - // 4 nulls - // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] - // [[7, 8]] - // [] - // [[]] - // def histogram [1, 3, 10] - // rep histogram [4, 4, 6] - lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; - - // 4 nulls - // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] - // [[7, 8]] - // [] - // [[]] - // def histogram [1, 1, 2, 10] - // rep histogram [4, 4, 6] - lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, lcw{}, lcw{lcw{}}}; - - // 6 nulls - // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] - // [[7, 8]] - // [] - // [[]] - // def histogram [1, 1, 2, 2, 8] - // rep histogram [4, 4, 6] - using dlcw = cudf::test::lists_column_wrapper; - dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, nulls_at({0, 2})}}, null_at(3)}, - {{7., 8.}}, - dlcw{}, - dlcw{dlcw{}}}; - - // 4 nulls - // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] - // [[7, 8]] - // [] - // NULL - // def histogram [1, 1, 1, 1, 10] - // rep histogram [4, 4, 6] - using ui16lcw = cudf::test::lists_column_wrapper; - cudf::test::lists_column_wrapper col4{ - {{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, - null_at(3)}; - - // 6 nulls - // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] - // [[7, 8]] - // [] - // NULL - // def histogram [1, 1, 1, 1, 2, 8] - // rep histogram [4, 4, 6] - lcw col5{{{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, nulls_at({0, 2})}}, null_at(3)}, - {{7, 8}}, - lcw{}, - lcw{lcw{}}}, - null_at(3)}; - - // 4 nulls - // def histogram [1, 3, 9] - // rep histogram [4, 4, 5] 
- using strlcw = cudf::test::lists_column_wrapper; - cudf::test::lists_column_wrapper col6{ - {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, - {{"bee", "sting"}}, - strlcw{}, - strlcw{strlcw{}}}; - - // 5 nulls - // def histogram [1, 3, 1, 8] - // rep histogram [4, 4, 5] - using strlcw = cudf::test::lists_column_wrapper; - cudf::test::lists_column_wrapper col7{{{"Monday", "Monday", "Friday"}, - {}, - {{"Monday", "Friday"}, null_at(1)}, - {}, - {"Sunday", "Funday"}}, - {{"bee", "sting"}}, - strlcw{}, - strlcw{strlcw{}}}; - - // 11 nulls - // D 5 6 5 6 5 6 5 6 6 - // R 0 3 3 3 1 3 3 2 3 - // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] - // D 2 6 6 6 6 2 - // R 0 1 2 3 3 1 - // [NULL, [[13],[14,15,16]], NULL] - // D 2 3 2 4 - // R 0 1 1 1 - // [NULL, [], NULL, [[]]] - // D 0 - // R 0 - // NULL - // def histogram [1, 0, 4, 1, 1, 4, 9] - // rep histogram [4, 6, 2, 8] - lcw col8{{ - {{{{1, 2, 3, 4}, nulls_at({0, 2})}}, {{{5, 6, 7}, nulls_at({0, 2})}, {8, 9}}}, - {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, nulls_at({0, 2})}, - {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, nulls_at({0, 2})}, - lcw{lcw{lcw{}}}, - }, - null_at(3)}; - - table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); - - int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; - - auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); - auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .write_v2_headers(is_v2) - 
.compression(cudf::io::compression_type::NONE); - - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - ASSERT_TRUE(chunk.meta_data.size_statistics.has_value()); - ASSERT_TRUE(chunk.meta_data.size_statistics->definition_level_histogram.has_value()); - ASSERT_TRUE(chunk.meta_data.size_statistics->repetition_level_histogram.has_value()); - // there is only one page, so chunk stats should match the page stats - EXPECT_EQ(chunk.meta_data.size_statistics->definition_level_histogram.value(), - expected_def_hists[c]); - EXPECT_EQ(chunk.meta_data.size_statistics->repetition_level_histogram.value(), - expected_rep_hists[c]); - // only column 6 has string data - if (c == 6) { - ASSERT_TRUE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); - EXPECT_EQ(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.value(), 50L); - } else if (c == 7) { - ASSERT_TRUE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); - EXPECT_EQ(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.value(), 44L); - } else { - EXPECT_FALSE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); - } - - // loop over offsets, read each page header, make sure it's a data page and that - // the first row index is correct - auto const oi = read_offset_index(source, chunk); - - for (size_t o = 0; o < oi.page_locations.size(); o++) { - auto const& page_loc = oi.page_locations[o]; - auto const ph = read_page_header(source, page_loc); - EXPECT_EQ(ph.type, expected_hdr_type); - // check null counts in V2 header - if (is_v2) { EXPECT_EQ(ph.data_page_header_v2.num_nulls, expected_null_counts[c]); } - } - - // 
check null counts in column chunk stats and page indexes - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - EXPECT_EQ(stats.null_count, expected_null_counts[c]); - - // should only be one page - EXPECT_FALSE(ci.null_pages[0]); - ASSERT_TRUE(ci.null_counts.has_value()); - EXPECT_EQ(ci.null_counts.value()[0], expected_null_counts[c]); - - ASSERT_TRUE(ci.definition_level_histogram.has_value()); - EXPECT_EQ(ci.definition_level_histogram.value(), expected_def_hists[c]); - - ASSERT_TRUE(ci.repetition_level_histogram.has_value()); - EXPECT_EQ(ci.repetition_level_histogram.value(), expected_rep_hists[c]); - - if (c == 6) { - ASSERT_TRUE(oi.unencoded_byte_array_data_bytes.has_value()); - EXPECT_EQ(oi.unencoded_byte_array_data_bytes.value()[0], 50L); - } else if (c == 7) { - ASSERT_TRUE(oi.unencoded_byte_array_data_bytes.has_value()); - EXPECT_EQ(oi.unencoded_byte_array_data_bytes.value()[0], 44L); - } else { - EXPECT_FALSE(oi.unencoded_byte_array_data_bytes.has_value()); - } - } - } -} - -TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) -{ - char const* coldata[] = { - // in-range 7 bit. should truncate to "yyyyyyyz" - "yyyyyyyyy", - // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's - // considered binary, not UTF-8. If UTF-8 it should not truncate. - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - // max binary. this should not truncate - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" - "ééééé", - // max 2-byte UTF8 (U+07FF). should not truncate - "߿߿߿߿߿", - // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" - "ࠀࠀࠀ", - // max 3-byte UTF8 (U+FFFF). should not truncate - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" - "𐀀𐀀𐀀", - // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, - // which is no longer valid unicode, but is still ok UTF-8??? 
- "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - // max 4-byte UTF8 (U+1FFFFF). should not truncate - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; - - // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger - // than that will not become minimum value (when written as UTF-8). - char const* truncated_min[] = {"yyyyyyyy", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", - "\xf7\xbf\xbf\xbf", - "éééé", - "߿߿߿߿", - "ࠀࠀ", - "\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀀", - "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", - "\xf7\xbf\xbf\xbf"}; - - char const* truncated_max[] = {"yyyyyyyz", - "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", - "\xff\xff\xff\xff\xff\xff\xff\xff\xff", - "éééê", - "߿߿߿߿߿", - "ࠀࠁ", - "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", - "𐀀𐀁", - "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", - "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; - - auto cols = [&]() { - using string_wrapper = column_wrapper; - std::vector> cols; - for (auto const str : coldata) { - cols.push_back(string_wrapper{str}.release()); - } - return cols; - }(); - auto expected = std::make_unique
(std::move(cols)); - - auto const filepath = temp_env->get_temp_filepath("CheckColumnIndexTruncation.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected->view()) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .column_index_truncate_length(8); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - - // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); - - // check that truncated values == expected - EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); - EXPECT_EQ(memcmp(ci.max_values[0].data(), truncated_max[c], ci.max_values[0].size()), 0); - } - } -} - -TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) -{ - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; - - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; - - cudf::test::lists_column_wrapper col0{ - {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 
0xfe}}; - cudf::test::lists_column_wrapper col1{ - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; - cudf::test::lists_column_wrapper col2{ - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; - - auto expected = table_view{{col0, col1, col2}}; - - cudf::io::table_input_metadata output_metadata(expected); - output_metadata.column_metadata[0].set_name("col_binary0").set_output_as_binary(true); - output_metadata.column_metadata[1].set_name("col_binary1").set_output_as_binary(true); - output_metadata.column_metadata[2].set_name("col_binary2").set_output_as_binary(true); - - auto const filepath = temp_env->get_temp_filepath("BinaryColumnIndexTruncation.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(output_metadata)) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .column_index_truncate_length(8); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (size_t r = 0; r < fmd.row_groups.size(); r++) { - auto const& rg = fmd.row_groups[r]; - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); - - // check that truncated values == expected - EXPECT_EQ(ci.min_values[0], truncated_min[c]); - EXPECT_EQ(ci.max_values[0], 
truncated_max[c]); - } - } -} - -TEST_F(ParquetReaderTest, EmptyColumnsParam) -{ - srand(31337); - auto const expected = create_random_fixed_table(2, 4, false); - - std::vector out_buffer; - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, *expected); - cudf::io::write_parquet(args); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}) - .columns({}); - auto const result = cudf::io::read_parquet(read_opts); - - EXPECT_EQ(result.tbl->num_columns(), 0); - EXPECT_EQ(result.tbl->num_rows(), 0); -} - -TEST_F(ParquetReaderTest, BinaryAsStrings) -{ - std::vector strings{ - "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; - auto const num_rows = strings.size(); - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - auto seq_col3 = random_values(num_rows); - auto validity = cudf::test::iterators::no_nulls(); - - column_wrapper int_col{seq_col0.begin(), seq_col0.end(), validity}; - column_wrapper string_col{strings.begin(), strings.end()}; - column_wrapper float_col{seq_col2.begin(), seq_col2.end(), validity}; - cudf::test::lists_column_wrapper list_int_col{ - {'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'M', 'o', 'n', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'u', 'n', 'd', 'a', 'y'}}; - - auto output = table_view{{int_col, string_col, float_col, string_col, list_int_col}}; - cudf::io::table_input_metadata output_metadata(output); - output_metadata.column_metadata[0].set_name("col_other"); - output_metadata.column_metadata[1].set_name("col_string"); - output_metadata.column_metadata[2].set_name("col_float"); - 
output_metadata.column_metadata[3].set_name("col_string2").set_output_as_binary(true); - output_metadata.column_metadata[4].set_name("col_binary").set_output_as_binary(true); - - auto filepath = temp_env->get_temp_filepath("BinaryReadStrings.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output) - .metadata(std::move(output_metadata)); - cudf::io::write_parquet(out_opts); - - auto expected_string = table_view{{int_col, string_col, float_col, string_col, string_col}}; - auto expected_mixed = table_view{{int_col, string_col, float_col, list_int_col, list_int_col}}; - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema({{}, {}, {}, {}, {}}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_string, result.tbl->view()); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_string, result.tbl->view()); - - std::vector md{ - {}, - {}, - {}, - cudf::io::reader_column_schema().set_convert_binary_to_strings(false), - cudf::io::reader_column_schema().set_convert_binary_to_strings(false)}; - - cudf::io::parquet_reader_options mixed_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema(md); - result = cudf::io::read_parquet(mixed_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_mixed, result.tbl->view()); -} - -TEST_F(ParquetReaderTest, NestedByteArray) -{ - constexpr auto num_rows = 8; - - auto seq_col0 = random_values(num_rows); - auto seq_col2 = random_values(num_rows); - auto seq_col3 = random_values(num_rows); - auto const validity = cudf::test::iterators::no_nulls(); - - column_wrapper int_col{seq_col0.begin(), seq_col0.end(), 
validity}; - column_wrapper float_col{seq_col2.begin(), seq_col2.end(), validity}; - cudf::test::lists_column_wrapper list_list_int_col{ - {{'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}}, - {{'M', 'o', 'n', 'd', 'a', 'y'}, {'F', 'r', 'i', 'd', 'a', 'y'}}, - {{'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}}, - {{'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'u', 'n', 'd', 'a', 'y'}}, - {{'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}}, - {{'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}, - {'F', 'u', 'n', 'd', 'a', 'y'}}, - {{'M', 'o', 'n', 'd', 'a', 'y'}, - {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, - {'F', 'r', 'i', 'd', 'a', 'y'}}, - {{'M', 'o', 'n', 'd', 'a', 'y'}, {'F', 'r', 'i', 'd', 'a', 'y'}}}; - - auto const expected = table_view{{int_col, float_col, list_list_int_col}}; - cudf::io::table_input_metadata output_metadata(expected); - output_metadata.column_metadata[0].set_name("col_other"); - output_metadata.column_metadata[1].set_name("col_float"); - output_metadata.column_metadata[2].set_name("col_binary").child(1).set_output_as_binary(true); - - auto filepath = temp_env->get_temp_filepath("NestedByteArray.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(output_metadata)); - cudf::io::write_parquet(out_opts); - - auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); - - std::vector md{ - {}, - {}, - cudf::io::reader_column_schema().add_child( - cudf::io::reader_column_schema().set_convert_binary_to_strings(false))}; - - 
cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema(md); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(ParquetWriterTest, ByteArrayStats) -{ - // check that byte array min and max statistics are written as expected. If a byte array is - // written as a string, max utf8 is 0xf7bfbfbf and so the minimum value will be set to that value - // instead of a potential minimum higher than that. - std::vector expected_col0_min{0xf0}; - std::vector expected_col0_max{0xf0, 0xf5, 0xf5}; - std::vector expected_col1_min{0xfe, 0xfe, 0xfe}; - std::vector expected_col1_max{0xfe, 0xfe, 0xfe}; - - cudf::test::lists_column_wrapper list_int_col0{ - {0xf0}, {0xf0, 0xf5, 0xf3}, {0xf0, 0xf5, 0xf5}}; - cudf::test::lists_column_wrapper list_int_col1{ - {0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}}; - - auto expected = table_view{{list_int_col0, list_int_col1}}; - cudf::io::table_input_metadata output_metadata(expected); - output_metadata.column_metadata[0].set_name("col_binary0").set_output_as_binary(true); - output_metadata.column_metadata[1].set_name("col_binary1").set_output_as_binary(true); - - auto filepath = temp_env->get_temp_filepath("ByteArrayStats.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(output_metadata)); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema({{}, {}}); - auto result = cudf::io::read_parquet(in_opts); - - auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); - 
EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); - - auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); - auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); - - EXPECT_EQ(expected_col0_min, stats0.min_value); - EXPECT_EQ(expected_col0_max, stats0.max_value); - EXPECT_EQ(expected_col1_min, stats1.min_value); - EXPECT_EQ(expected_col1_max, stats1.max_value); -} - -TEST_F(ParquetReaderTest, StructByteArray) -{ - constexpr auto num_rows = 100; - - auto seq_col0 = random_values(num_rows); - auto const validity = cudf::test::iterators::no_nulls(); - - column_wrapper int_col{seq_col0.begin(), seq_col0.end(), validity}; - cudf::test::lists_column_wrapper list_of_int{{seq_col0.begin(), seq_col0.begin() + 50}, - {seq_col0.begin() + 50, seq_col0.end()}}; - auto struct_col = cudf::test::structs_column_wrapper{{list_of_int}, validity}; - - auto const expected = table_view{{struct_col}}; - EXPECT_EQ(1, expected.num_columns()); - cudf::io::table_input_metadata output_metadata(expected); - output_metadata.column_metadata[0] - .set_name("struct_binary") - .child(0) - .set_name("a") - .set_output_as_binary(true); - - auto filepath = temp_env->get_temp_filepath("StructByteArray.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(output_metadata)); - cudf::io::write_parquet(out_opts); - - std::vector md{cudf::io::reader_column_schema().add_child( - cudf::io::reader_column_schema().set_convert_binary_to_strings(false))}; - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .set_column_schema(md); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(ParquetReaderTest, NestingOptimizationTest) -{ - // test nesting levels > 
cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. - constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); - constexpr cudf::size_type rows_per_level = 2; - - constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; - auto value_iter = thrust::make_counting_iterator(0); - auto validity = - cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); - cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); - - // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto prev_col = values.release(); - for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; - auto const num_rows = (1 << (num_nesting_levels - idx)); - - auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); - - cudf::test::fixed_width_column_wrapper offsets(offsets_iter, - offsets_iter + num_rows + 1); - auto c = cudf::make_lists_column(num_rows, offsets.release(), std::move(prev_col), 0, {}); - prev_col = std::move(c); - } - auto const& expect = prev_col; - - auto filepath = temp_env->get_temp_filepath("NestingDecodeCache.parquet"); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table_view{{*expect}}); - cudf::io::write_parquet(opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expect, result.tbl->get_column(0)); -} - -TEST_F(ParquetWriterTest, SingleValueDictionaryTest) -{ - constexpr unsigned int expected_bits = 1; - constexpr unsigned int nrows = 
1'000'000U; - - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return "a unique string value suffixed with 1"; }); - auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("SingleValueDictionaryTest.parquet"); - // set row group size so that there will be only one row group - // no compression so we can easily read page data - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::NONE) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .row_group_size_rows(nrows); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - - // make sure dictionary was used - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto used_dict = [&fmd]() { - for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { - return true; - } - } - return false; - }; - EXPECT_TRUE(used_dict()); - - // and check that the correct number of bits was used - auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); - auto const nbits = read_dict_bits(source, oi.page_locations[0]); - EXPECT_EQ(nbits, expected_bits); -} - -TEST_F(ParquetWriterTest, DictionaryNeverTest) -{ - constexpr unsigned int nrows = 1'000U; - - // only one value, so would normally use dictionary - auto elements = cudf::detail::make_counting_transform_iterator( - 0, 
[](auto i) { return "a unique string value suffixed with 1"; }); - auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("DictionaryNeverTest.parquet"); - // no compression so we can easily read page data - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - - // make sure dictionary was not used - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto used_dict = [&fmd]() { - for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { - return true; - } - } - return false; - }; - EXPECT_FALSE(used_dict()); -} - -TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) -{ - constexpr unsigned int nrows = 65'536U; - // cardinality is chosen to result in a dictionary > 1MB in size - constexpr unsigned int cardinality = 32'768U; - - // single value will have a small dictionary - auto elements0 = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return "a unique string value suffixed with 1"; }); - auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); - - // high cardinality will have a large dictionary - auto elements1 = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { - return 
"a unique string value suffixed with " + std::to_string(i % cardinality); - }); - auto const col1 = cudf::test::strings_column_wrapper(elements1, elements1 + nrows); - - auto const expected = table_view{{col0, col1}}; - - auto const filepath = temp_env->get_temp_filepath("DictionaryAdaptiveTest.parquet"); - // no compression so we can easily read page data - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::ZSTD) - .dictionary_policy(cudf::io::dictionary_policy::ADAPTIVE); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - - // make sure dictionary was used as expected. col0 should use one, - // col1 should not. - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto used_dict = [&fmd](int col) { - for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { - return true; - } - } - return false; - }; - EXPECT_TRUE(used_dict(0)); - EXPECT_FALSE(used_dict(1)); -} - -TEST_F(ParquetWriterTest, DictionaryAlwaysTest) -{ - constexpr unsigned int nrows = 65'536U; - // cardinality is chosen to result in a dictionary > 1MB in size - constexpr unsigned int cardinality = 32'768U; - - // single value will have a small dictionary - auto elements0 = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return "a unique string value suffixed with 1"; }); - auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); - - // high cardinality will have a 
large dictionary - auto elements1 = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { - return "a unique string value suffixed with " + std::to_string(i % cardinality); - }); - auto const col1 = cudf::test::strings_column_wrapper(elements1, elements1 + nrows); - - auto const expected = table_view{{col0, col1}}; - - auto const filepath = temp_env->get_temp_filepath("DictionaryAlwaysTest.parquet"); - // no compression so we can easily read page data - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::ZSTD) - .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - - // make sure dictionary was used for both columns - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto used_dict = [&fmd](int col) { - for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { - return true; - } - } - return false; - }; - EXPECT_TRUE(used_dict(0)); - EXPECT_TRUE(used_dict(1)); -} - -TEST_F(ParquetWriterTest, DictionaryPageSizeEst) -{ - // one page - constexpr unsigned int nrows = 20'000U; - - // this test is creating a pattern of repeating then non-repeating values to trigger - // a "worst-case" for page size estimation in the presence of a dictionary. have confirmed - // that this fails for values over 16 in the final term of `max_RLE_page_size()`. 
- // The output of the iterator will be 'CCCCCRRRRRCCCCCRRRRR...` where 'C' is a changing - // value, and 'R' repeats. The encoder will turn this into a literal run of 8 values - // (`CCCCCRRR`) followed by a repeated run of 2 (`RR`). This pattern then repeats, getting - // as close as possible to a condition of repeated 8 value literal runs. - auto elements0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - if ((i / 5) % 2 == 1) { - return std::string("non-unique string"); - } else { - return "a unique string value suffixed with " + std::to_string(i); - } - }); - auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); - - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("DictionaryPageSizeEst.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::ZSTD) - .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_P(ParquetSizedTest, DictionaryTest) -{ - unsigned int const cardinality = (1 << (GetParam() - 1)) + 1; - unsigned int const nrows = std::max(cardinality * 3 / 2, 3'000'000U); - - auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { - return "a unique string value suffixed with " + std::to_string(i % cardinality); - }); - auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); - auto const expected = table_view{{col0}}; - - auto const filepath = temp_env->get_temp_filepath("DictionaryTest.parquet"); - // set row group size so that there will be only one row group - // no 
compression so we can easily read page data - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .compression(cudf::io::compression_type::NONE) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .dictionary_policy(cudf::io::dictionary_policy::ALWAYS) - .row_group_size_rows(nrows) - .row_group_size_bytes(512 * 1024 * 1024); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - - // make sure dictionary was used - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto used_dict = [&fmd]() { - for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { - return true; - } - } - return false; - }; - EXPECT_TRUE(used_dict()); - - // and check that the correct number of bits was used - auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); - auto const nbits = read_dict_bits(source, oi.page_locations[0]); - EXPECT_EQ(nbits, GetParam()); -} - -TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes) -{ - using T = TypeParam; - - srand(31337); - auto table = create_random_fixed_table(5, 5, true); - - std::vector out_buffer; - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); - cudf::io::write_parquet(out_opts); - - { - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info( - cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()))); - auto const 
result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); - } - - { - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span( - reinterpret_cast(out_buffer.data()), out_buffer.size()))); - auto const result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); - } -} - -TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) -{ - using T = TypeParam; - - srand(31337); - auto table = create_random_fixed_table(5, 5, true); - - std::vector out_buffer; - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); - cudf::io::write_parquet(out_opts); - - auto full_table = cudf::concatenate(std::vector({*table, *table})); - - { - auto spans = std::vector>{ - cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), - cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; - cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); - auto const result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); - } - - { - auto spans = std::vector>{ - cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), - cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; - cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); - auto const result = cudf::io::read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); - } -} - -TEST_F(ParquetWriterTest, UserNullability) -{ - auto weight_col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; - auto ages_col = cudf::test::fixed_width_column_wrapper{{30, 
27, 5}}; - auto struct_col = cudf::test::structs_column_wrapper{weight_col, ages_col}; - - auto expected = table_view({struct_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_nullability(false); - expected_metadata.column_metadata[0].child(0).set_nullability(true); - - auto filepath = temp_env->get_temp_filepath("SingleWriteNullable.parquet"); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_parquet(write_opts); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(read_opts); - - EXPECT_FALSE(result.tbl->view().column(0).nullable()); - EXPECT_TRUE(result.tbl->view().column(0).child(0).nullable()); - EXPECT_FALSE(result.tbl->view().column(0).child(1).nullable()); -} - -TEST_F(ParquetWriterTest, UserNullabilityInvalid) -{ - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); - auto col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}, valids}; - auto expected = table_view({col}); - - auto filepath = temp_env->get_temp_filepath("SingleWriteNullableInvalid.parquet"); - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - // Should work without the nullability option - EXPECT_NO_THROW(cudf::io::write_parquet(write_opts)); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_nullability(false); - write_opts.set_metadata(std::move(expected_metadata)); - // Can't write a column with nulls as not nullable - EXPECT_THROW(cudf::io::write_parquet(write_opts), cudf::logic_error); -} - -TEST_F(ParquetReaderTest, SingleLevelLists) -{ - unsigned char list_bytes[] = { - 
0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 0x91, 0x8c, 0x06, - 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, - 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, - 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, - 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, - 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, - 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, - 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, - 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, - 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, - 0x14, 0x00, 0x00, 0x28, 0x13, 0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, - 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, - 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - // read single level list reproducing parquet file - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)}); - auto table = cudf::io::read_parquet(read_opts); - - auto const c0 = table.tbl->get_column(0); - EXPECT_TRUE(c0.type().id() == cudf::type_id::LIST); - - auto const lc = cudf::lists_column_view(c0); - auto const child = lc.child(); - EXPECT_TRUE(child.type().id() == cudf::type_id::INT32); -} - -TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) -{ - unsigned char list_bytes[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x28, 0x15, 0x28, 0x15, 0xa7, 0xce, 
0x91, 0x8c, 0x06, - 0x1c, 0x15, 0x04, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, - 0x02, 0x02, 0x00, 0x00, 0x00, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, - 0x02, 0x19, 0x3c, 0x48, 0x0c, 0x73, 0x70, 0x61, 0x72, 0x6b, 0x5f, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x15, 0x02, 0x00, 0x35, 0x00, 0x18, 0x01, 0x66, 0x15, 0x02, 0x15, 0x06, 0x4c, 0x3c, 0x00, - 0x00, 0x00, 0x15, 0x02, 0x25, 0x04, 0x18, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x00, 0x16, 0x02, - 0x19, 0x1c, 0x19, 0x1c, 0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, - 0x66, 0x05, 0x61, 0x72, 0x72, 0x61, 0x79, 0x15, 0x00, 0x16, 0x04, 0x16, 0x56, 0x16, 0x56, 0x26, - 0x08, 0x3c, 0x18, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x16, 0x00, - 0x28, 0x04, 0x01, 0x00, 0x00, 0x00, 0x18, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x1c, 0x15, - 0x00, 0x15, 0x00, 0x15, 0x02, 0x00, 0x00, 0x00, 0x16, 0x56, 0x16, 0x02, 0x26, 0x08, 0x16, 0x56, - 0x14, 0x00, 0x00, 0x28, 0x13, 0x52, 0x41, 0x50, 0x49, 0x44, 0x53, 0x20, 0x53, 0x70, 0x61, 0x72, - 0x6b, 0x20, 0x50, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x19, 0x1c, 0x1c, 0x00, 0x00, 0x00, 0x9f, 0x00, - 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - auto reader = cudf::io::chunked_parquet_reader( - 1L << 31, - cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); - int iterations = 0; - while (reader.has_next() && iterations < 10) { - auto chunk = reader.read_chunk(); - } - EXPECT_TRUE(iterations < 10); -} - -TEST_F(ParquetWriterTest, CompStats) -{ - auto table = create_random_fixed_table(1, 100000, true); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}, table->view()) - .compression_statistics(stats); - cudf::io::write_parquet(opts); - - EXPECT_NE(stats->num_compressed_bytes(), 0); - 
EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_FALSE(std::isnan(stats->compression_ratio())); -} - -TEST_F(ParquetChunkedWriterTest, CompStats) -{ - auto table = create_random_fixed_table(1, 100000, true); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::chunked_parquet_writer_options opts = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}) - .compression_statistics(stats); - cudf::io::parquet_chunked_writer(opts).write(*table); - - EXPECT_NE(stats->num_compressed_bytes(), 0); - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_FALSE(std::isnan(stats->compression_ratio())); - - auto const single_table_comp_stats = *stats; - cudf::io::parquet_chunked_writer(opts).write(*table); - - EXPECT_EQ(stats->compression_ratio(), single_table_comp_stats.compression_ratio()); - EXPECT_EQ(stats->num_compressed_bytes(), 2 * single_table_comp_stats.num_compressed_bytes()); - - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); -} - -void expect_compression_stats_empty(std::shared_ptr stats) -{ - EXPECT_EQ(stats->num_compressed_bytes(), 0); - EXPECT_EQ(stats->num_failed_bytes(), 0); - EXPECT_EQ(stats->num_skipped_bytes(), 0); - EXPECT_TRUE(std::isnan(stats->compression_ratio())); -} - -TEST_F(ParquetWriterTest, CompStatsEmptyTable) -{ - auto table_no_rows = create_random_fixed_table(20, 0, false); - - auto const stats = std::make_shared(); - - std::vector unused_buffer; - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}, - table_no_rows->view()) - .compression_statistics(stats); - cudf::io::write_parquet(opts); - - expect_compression_stats_empty(stats); -} - -TEST_F(ParquetChunkedWriterTest, CompStatsEmptyTable) -{ - auto table_no_rows = create_random_fixed_table(20, 0, false); - - auto const stats = 
std::make_shared(); - - std::vector unused_buffer; - cudf::io::chunked_parquet_writer_options opts = - cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}) - .compression_statistics(stats); - cudf::io::parquet_chunked_writer(opts).write(*table_no_rows); - - expect_compression_stats_empty(stats); -} - -TEST_F(ParquetReaderTest, ReorderedReadMultipleFiles) -{ - constexpr auto num_rows = 50'000; - constexpr auto cardinality = 20'000; - - // table 1 - auto str1 = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return "cat " + std::to_string(i % cardinality); }); - auto cols1 = cudf::test::strings_column_wrapper(str1, str1 + num_rows); - - auto int1 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % cardinality; }); - auto coli1 = cudf::test::fixed_width_column_wrapper(int1, int1 + num_rows); - - auto const expected1 = table_view{{cols1, coli1}}; - auto const swapped1 = table_view{{coli1, cols1}}; - - auto const filepath1 = temp_env->get_temp_filepath("LargeReorderedRead1.parquet"); - auto out_opts1 = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath1}, expected1) - .compression(cudf::io::compression_type::NONE); - cudf::io::write_parquet(out_opts1); - - // table 2 - auto str2 = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return "dog " + std::to_string(i % cardinality); }); - auto cols2 = cudf::test::strings_column_wrapper(str2, str2 + num_rows); - - auto int2 = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return (i % cardinality) + cardinality; }); - auto coli2 = cudf::test::fixed_width_column_wrapper(int2, int2 + num_rows); - - auto const expected2 = table_view{{cols2, coli2}}; - auto const swapped2 = table_view{{coli2, cols2}}; - - auto const filepath2 = temp_env->get_temp_filepath("LargeReorderedRead2.parquet"); - auto out_opts2 = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath2}, expected2) - 
.compression(cudf::io::compression_type::NONE); - cudf::io::write_parquet(out_opts2); - - // read in both files swapping the columns - auto read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{{filepath1, filepath2}}) - .columns({"_col1", "_col0"}); - auto result = cudf::io::read_parquet(read_opts); - auto sliced = cudf::slice(result.tbl->view(), {0, num_rows, num_rows, 2 * num_rows}); - CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[0], swapped1); - CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[1], swapped2); -} - -// Test fixture for metadata tests -struct ParquetMetadataReaderTest : public cudf::test::BaseFixture { - std::string print(cudf::io::parquet_column_schema schema, int depth = 0) - { - std::string child_str; - for (auto const& child : schema.children()) { - child_str += print(child, depth + 1); - } - return std::string(depth, ' ') + schema.name() + "\n" + child_str; - } -}; - -TEST_F(ParquetMetadataReaderTest, TestBasic) -{ - auto const num_rows = 1200; - - auto ints = random_values(num_rows); - auto floats = random_values(num_rows); - column_wrapper int_col(ints.begin(), ints.end()); - column_wrapper float_col(floats.begin(), floats.end()); - - table_view expected({int_col, float_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("int_col"); - expected_metadata.column_metadata[1].set_name("float_col"); - - auto filepath = temp_env->get_temp_filepath("MetadataTest.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_parquet(out_opts); - - auto meta = read_parquet_metadata(cudf::io::source_info{filepath}); - EXPECT_EQ(meta.num_rows(), num_rows); - - std::string expected_schema = R"(schema - int_col - float_col -)"; - EXPECT_EQ(expected_schema, print(meta.schema().root())); - - EXPECT_EQ(meta.schema().root().name(), "schema"); - 
EXPECT_EQ(meta.schema().root().type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); - ASSERT_EQ(meta.schema().root().num_children(), 2); - - EXPECT_EQ(meta.schema().root().child(0).name(), "int_col"); - EXPECT_EQ(meta.schema().root().child(1).name(), "float_col"); -} - -TEST_F(ParquetMetadataReaderTest, TestNested) -{ - auto const num_rows = 1200; - auto const lists_per_row = 4; - auto const num_child_rows = num_rows * lists_per_row; - - auto keys = random_values(num_child_rows); - auto vals = random_values(num_child_rows); - column_wrapper keys_col(keys.begin(), keys.end()); - column_wrapper vals_col(vals.begin(), vals.end()); - auto s_col = cudf::test::structs_column_wrapper({keys_col, vals_col}).release(); - - std::vector row_offsets(num_rows + 1); - for (int idx = 0; idx < num_rows + 1; ++idx) { - row_offsets[idx] = idx * lists_per_row; - } - column_wrapper offsets(row_offsets.begin(), row_offsets.end()); - - auto list_col = - cudf::make_lists_column(num_rows, offsets.release(), std::move(s_col), 0, rmm::device_buffer{}); - - table_view expected({*list_col, *list_col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("maps"); - expected_metadata.column_metadata[0].set_list_column_as_map(); - expected_metadata.column_metadata[1].set_name("lists"); - expected_metadata.column_metadata[1].child(1).child(0).set_name("int_field"); - expected_metadata.column_metadata[1].child(1).child(1).set_name("float_field"); - - auto filepath = temp_env->get_temp_filepath("MetadataTest.orc"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - cudf::io::write_parquet(out_opts); - - auto meta = read_parquet_metadata(cudf::io::source_info{filepath}); - EXPECT_EQ(meta.num_rows(), num_rows); - - std::string expected_schema = R"(schema - maps - key_value - key - value - lists - list - element - 
int_field - float_field -)"; - EXPECT_EQ(expected_schema, print(meta.schema().root())); - - EXPECT_EQ(meta.schema().root().name(), "schema"); - EXPECT_EQ(meta.schema().root().type_kind(), - cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct - ASSERT_EQ(meta.schema().root().num_children(), 2); - - auto const& out_map_col = meta.schema().root().child(0); - EXPECT_EQ(out_map_col.name(), "maps"); - EXPECT_EQ(out_map_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // map - - ASSERT_EQ(out_map_col.num_children(), 1); - EXPECT_EQ(out_map_col.child(0).name(), "key_value"); // key_value (named in parquet writer) - ASSERT_EQ(out_map_col.child(0).num_children(), 2); - EXPECT_EQ(out_map_col.child(0).child(0).name(), "key"); // key (named in parquet writer) - EXPECT_EQ(out_map_col.child(0).child(1).name(), "value"); // value (named in parquet writer) - EXPECT_EQ(out_map_col.child(0).child(0).type_kind(), cudf::io::parquet::TypeKind::INT32); // int - EXPECT_EQ(out_map_col.child(0).child(1).type_kind(), - cudf::io::parquet::TypeKind::FLOAT); // float - - auto const& out_list_col = meta.schema().root().child(1); - EXPECT_EQ(out_list_col.name(), "lists"); - EXPECT_EQ(out_list_col.type_kind(), cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // list - // TODO repetition type? 
- ASSERT_EQ(out_list_col.num_children(), 1); - EXPECT_EQ(out_list_col.child(0).name(), "list"); // list (named in parquet writer) - ASSERT_EQ(out_list_col.child(0).num_children(), 1); - - auto const& out_list_struct_col = out_list_col.child(0).child(0); - EXPECT_EQ(out_list_struct_col.name(), "element"); // elements (named in parquet writer) - EXPECT_EQ(out_list_struct_col.type_kind(), - cudf::io::parquet::TypeKind::UNDEFINED_TYPE); // struct - ASSERT_EQ(out_list_struct_col.num_children(), 2); - - auto const& out_int_col = out_list_struct_col.child(0); - EXPECT_EQ(out_int_col.name(), "int_field"); - EXPECT_EQ(out_int_col.type_kind(), cudf::io::parquet::TypeKind::INT32); - - auto const& out_float_col = out_list_struct_col.child(1); - EXPECT_EQ(out_float_col.name(), "float_field"); - EXPECT_EQ(out_float_col.type_kind(), cudf::io::parquet::TypeKind::FLOAT); -} - -TEST_F(ParquetWriterTest, NoNullsAsNonNullable) -{ - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{{1, 2, 3}, valids}; - table_view expected({col}); - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_nullability(false); - - auto filepath = temp_env->get_temp_filepath("NonNullable.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)); - // Writer should be able to write a column without nulls as non-nullable - EXPECT_NO_THROW(cudf::io::write_parquet(out_opts)); -} - -TEST_F(ParquetReaderTest, FilterSimple) -{ - srand(31337); - auto written_table = create_random_fixed_table(9, 9, false); - - auto filepath = temp_env->get_temp_filepath("FilterSimple.parquet"); - cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *written_table); - cudf::io::write_parquet(args); - - // Filtering AST - table[0] < 
RAND_MAX/2 - auto literal_value = cudf::numeric_scalar(RAND_MAX / 2); - auto literal = cudf::ast::literal(literal_value); - auto col_ref_0 = cudf::ast::column_reference(0); - auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); - - auto predicate = cudf::compute_column(*written_table, filter_expression); - EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) - << "Predicate filter should return a boolean"; - auto expected = cudf::apply_boolean_mask(*written_table, *predicate); - // To make sure AST filters out some elements - EXPECT_LT(expected->num_rows(), written_table->num_rows()); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(filter_expression); - auto result = cudf::io::read_parquet(read_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -auto create_parquet_with_stats(std::string const& filename) -{ - auto col0 = testdata::ascending(); - auto col1 = testdata::descending(); - auto col2 = testdata::unordered(); - - auto const expected = table_view{{col0, col1, col2}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("col_uint32"); - expected_metadata.column_metadata[1].set_name("col_int64"); - expected_metadata.column_metadata[2].set_name("col_double"); - - auto const filepath = temp_env->get_temp_filepath(filename); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(std::move(expected_metadata)) - .row_group_size_rows(8000) - .stats_level(cudf::io::statistics_freq::STATISTICS_ROWGROUP); - cudf::io::write_parquet(out_opts); - - std::vector> columns; - columns.push_back(col0.release()); - columns.push_back(col1.release()); - columns.push_back(col2.release()); - - return std::pair{cudf::table{std::move(columns)}, filepath}; -} - 
-TEST_F(ParquetReaderTest, FilterIdentity) -{ - auto [src, filepath] = create_parquet_with_stats("FilterIdentity.parquet"); - - // Filtering AST - identity function, always true. - auto literal_value = cudf::numeric_scalar(true); - auto literal = cudf::ast::literal(literal_value); - auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(filter_expression); - auto result = cudf::io::read_parquet(read_opts); - - cudf::io::parquet_reader_options read_opts2 = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result2 = cudf::io::read_parquet(read_opts2); - - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *result2.tbl); -} - -TEST_F(ParquetReaderTest, FilterReferenceExpression) -{ - auto [src, filepath] = create_parquet_with_stats("FilterReferenceExpression.parquet"); - // Filtering AST - table[0] < 150 - auto literal_value = cudf::numeric_scalar(150); - auto literal = cudf::ast::literal(literal_value); - auto col_ref_0 = cudf::ast::column_reference(0); - auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); - - // Expected result - auto predicate = cudf::compute_column(src, filter_expression); - auto expected = cudf::apply_boolean_mask(src, *predicate); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(filter_expression); - auto result = cudf::io::read_parquet(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -TEST_F(ParquetReaderTest, FilterNamedExpression) -{ - auto [src, filepath] = create_parquet_with_stats("NamedExpression.parquet"); - // Filtering AST - table["col_uint32"] < 150 - auto literal_value = cudf::numeric_scalar(150); - auto literal = cudf::ast::literal(literal_value); - auto col_name_0 = 
cudf::ast::column_name_reference("col_uint32"); - auto parquet_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); - auto col_ref_0 = cudf::ast::column_reference(0); - auto table_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); - - // Expected result - auto predicate = cudf::compute_column(src, table_filter); - auto expected = cudf::apply_boolean_mask(src, *predicate); - - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(parquet_filter); - auto result = cudf::io::read_parquet(read_opts); - - // tests - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); -} - -// Test for Types - numeric, chrono, string. -template -struct ParquetReaderPredicatePushdownTest : public ParquetReaderTest {}; - -// These chrono types are not supported because parquet writer does not have a type to represent -// them. -using UnsupportedChronoTypes = - cudf::test::Types; -// Also fixed point types unsupported, because AST does not support them yet. 
-using SupportedTestTypes = cudf::test::RemoveIf, - cudf::test::ComparableTypes>; - -TYPED_TEST_SUITE(ParquetReaderPredicatePushdownTest, SupportedTestTypes); - -template -auto create_parquet_typed_with_stats(std::string const& filename) -{ - auto col0 = testdata::ascending(); - auto col1 = testdata::descending(); - auto col2 = testdata::unordered(); - - auto const written_table = table_view{{col0, col1, col2}}; - auto const filepath = temp_env->get_temp_filepath("FilterTyped.parquet"); - { - cudf::io::table_input_metadata expected_metadata(written_table); - expected_metadata.column_metadata[0].set_name("col0"); - expected_metadata.column_metadata[1].set_name("col1"); - expected_metadata.column_metadata[2].set_name("col2"); - - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) - .metadata(std::move(expected_metadata)) - .row_group_size_rows(8000); - cudf::io::write_parquet(out_opts); - } - - std::vector> columns; - columns.push_back(col0.release()); - columns.push_back(col1.release()); - columns.push_back(col2.release()); - - return std::pair{cudf::table{std::move(columns)}, filepath}; -} - -TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) -{ - using T = TypeParam; - - auto const [src, filepath] = create_parquet_typed_with_stats("FilterTyped.parquet"); - auto const written_table = src.view(); - - // Filtering AST - auto literal_value = []() { - if constexpr (cudf::is_timestamp()) { - // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds - return cudf::timestamp_scalar(T(typename T::duration(10000))); // i (0-20,000) - } else if constexpr (cudf::is_duration()) { - // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds - return cudf::duration_scalar(T(10000)); // i (0-20,000) - } else if constexpr (std::is_same_v) { - // table[0] < "000010000" - return cudf::string_scalar("000010000"); // i (0-20,000) - } else { - // table[0] 
< 0 or 100u - return cudf::numeric_scalar((100 - 100 * std::is_signed_v)); // i/100 (-100-100/ 0-200) - } - }(); - auto literal = cudf::ast::literal(literal_value); - auto col_name_0 = cudf::ast::column_name_reference("col0"); - auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal); - auto col_ref_0 = cudf::ast::column_reference(0); - auto ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal); - - // Expected result - auto predicate = cudf::compute_column(written_table, ref_filter); - EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8) - << "Predicate filter should return a boolean"; - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - // Reading with Predicate Pushdown - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) - .filter(filter_expression); - auto result = cudf::io::read_parquet(read_opts); - auto result_table = result.tbl->view(); - - // tests - EXPECT_EQ(int(written_table.column(0).type().id()), int(result_table.column(0).type().id())) - << "col0 type mismatch"; - // To make sure AST filters out some elements - EXPECT_LT(expected->num_rows(), written_table.num_rows()); - EXPECT_EQ(result_table.num_rows(), expected->num_rows()); - EXPECT_EQ(result_table.num_columns(), expected->num_columns()); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); -} - -TEST_F(ParquetReaderTest, FilterMultiple1) -{ - using T = cudf::string_view; - - auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple1.parquet"); - auto const written_table = src.view(); - - // Filtering AST - 10000 < table[0] < 12000 - std::string const low = "000010000"; - std::string const high = "000012000"; - auto lov = cudf::string_scalar(low, true); - auto hiv = cudf::string_scalar(high, true); - auto filter_col = cudf::ast::column_reference(0); - auto lo_lit = cudf::ast::literal(lov); - 
auto hi_lit = cudf::ast::literal(hiv); - auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit); - auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit); - auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); - - // Expected result - auto predicate = cudf::compute_column(written_table, expr_3); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - auto si = cudf::io::source_info(filepath); - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_3); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - // tests - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); -} - -TEST_F(ParquetReaderTest, FilterMultiple2) -{ - // multiple conditions on same column. - using T = cudf::string_view; - - auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple2.parquet"); - auto const written_table = src.view(); - // 0-8000, 8001-16000, 16001-20000 - - // Filtering AST - // (table[0] >= "000010000" AND table[0] < "000012000") OR - // (table[0] >= "000017000" AND table[0] < "000019000") - std::string const low1 = "000010000"; - std::string const high1 = "000012000"; - auto lov = cudf::string_scalar(low1, true); - auto hiv = cudf::string_scalar(high1, true); - auto filter_col = cudf::ast::column_reference(0); - auto lo_lit = cudf::ast::literal(lov); - auto hi_lit = cudf::ast::literal(hiv); - auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit); - auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit); - auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); - std::string const low2 = "000017000"; - std::string const high2 = "000019000"; - auto lov2 = cudf::string_scalar(low2, true); - auto hiv2 = cudf::string_scalar(high2, true); - auto lo_lit2 = 
cudf::ast::literal(lov2); - auto hi_lit2 = cudf::ast::literal(hiv2); - auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col, lo_lit2); - auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col, hi_lit2); - auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); - auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); - - // Expected result - auto predicate = cudf::compute_column(written_table, expr_7); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - auto si = cudf::io::source_info(filepath); - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_7); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - // tests - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); -} - -TEST_F(ParquetReaderTest, FilterMultiple3) -{ - // multiple conditions with reference to multiple columns. - // index and name references mixed. - using T = uint32_t; - auto const [src, filepath] = create_parquet_typed_with_stats("FilterMultiple3.parquet"); - auto const written_table = src.view(); - - // Filtering AST - (table[0] >= 70 AND table[0] < 90) OR (table[1] >= 100 AND table[1] < 120) - // row groups min, max: - // table[0] 0-80, 81-160, 161-200. - // table[1] 200-121, 120-41, 40-0. 
- auto filter_col1 = cudf::ast::column_reference(0); - auto filter_col2 = cudf::ast::column_name_reference("col1"); - T constexpr low1 = 70; - T constexpr high1 = 90; - T constexpr low2 = 100; - T constexpr high2 = 120; - auto lov = cudf::numeric_scalar(low1, true); - auto hiv = cudf::numeric_scalar(high1, true); - auto lo_lit1 = cudf::ast::literal(lov); - auto hi_lit1 = cudf::ast::literal(hiv); - auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col1, lo_lit1); - auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, hi_lit1); - auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); - auto lov2 = cudf::numeric_scalar(low2, true); - auto hiv2 = cudf::numeric_scalar(high2, true); - auto lo_lit2 = cudf::ast::literal(lov2); - auto hi_lit2 = cudf::ast::literal(hiv2); - auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2, lo_lit2); - auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, hi_lit2); - auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); - // expression to test - auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); - - // Expected result - auto filter_col2_ref = cudf::ast::column_reference(1); - auto expr_4_ref = - cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2_ref, lo_lit2); - auto expr_5_ref = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2_ref, hi_lit2); - auto expr_6_ref = - cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4_ref, expr_5_ref); - auto expr_7_ref = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6_ref); - auto predicate = cudf::compute_column(written_table, expr_7_ref); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - auto si = cudf::io::source_info(filepath); - auto builder = 
cudf::io::parquet_reader_options::builder(si).filter(expr_7); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - // tests - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); -} - -TEST_F(ParquetReaderTest, FilterSupported) -{ - using T = uint32_t; - auto const [src, filepath] = create_parquet_typed_with_stats("FilterSupported.parquet"); - auto const written_table = src.view(); - - // Filtering AST - ((table[0] > 70 AND table[0] <= 90) OR (table[1] >= 100 AND table[1] < 120)) - // AND (table[1] != 110) - // row groups min, max: - // table[0] 0-80, 81-160, 161-200. - // table[1] 200-121, 120-41, 40-0. - auto filter_col1 = cudf::ast::column_reference(0); - auto filter_col2 = cudf::ast::column_reference(1); - T constexpr low1 = 70; - T constexpr high1 = 90; - T constexpr low2 = 100; - T constexpr high2 = 120; - T constexpr skip_value = 110; - auto lov = cudf::numeric_scalar(low1, true); - auto hiv = cudf::numeric_scalar(high1, true); - auto lo_lit1 = cudf::ast::literal(lov); - auto hi_lit1 = cudf::ast::literal(hiv); - auto expr_1 = cudf::ast::operation(cudf::ast::ast_operator::GREATER, filter_col1, lo_lit1); - auto expr_2 = cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, filter_col1, hi_lit1); - auto expr_3 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_1, expr_2); - auto lov2 = cudf::numeric_scalar(low2, true); - auto hiv2 = cudf::numeric_scalar(high2, true); - auto lo_lit2 = cudf::ast::literal(lov2); - auto hi_lit2 = cudf::ast::literal(hiv2); - auto expr_4 = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, filter_col2, lo_lit2); - auto expr_5 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, hi_lit2); - auto expr_6 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_4, expr_5); - auto expr_7 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, expr_3, expr_6); - auto skip_ov = cudf::numeric_scalar(skip_value, true); - 
auto skip_lit = cudf::ast::literal(skip_ov); - auto expr_8 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col2, skip_lit); - auto expr_9 = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_7, expr_8); - - // Expected result - auto predicate = cudf::compute_column(written_table, expr_9); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - auto si = cudf::io::source_info(filepath); - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_9); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - // tests - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); -} - -TEST_F(ParquetReaderTest, FilterSupported2) -{ - using T = uint32_t; - constexpr auto num_rows = 4000; - auto elements0 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 2000; }); - auto elements1 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 1000; }); - auto elements2 = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 500; }); - auto col0 = cudf::test::fixed_width_column_wrapper(elements0, elements0 + num_rows); - auto col1 = cudf::test::fixed_width_column_wrapper(elements1, elements1 + num_rows); - auto col2 = cudf::test::fixed_width_column_wrapper(elements2, elements2 + num_rows); - auto const written_table = table_view{{col0, col1, col2}}; - auto const filepath = temp_env->get_temp_filepath("FilterSupported2.parquet"); - { - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) - .row_group_size_rows(1000); - cudf::io::write_parquet(out_opts); - } - auto si = cudf::io::source_info(filepath); - auto filter_col0 = cudf::ast::column_reference(0); - auto filter_col1 = cudf::ast::column_reference(1); - auto filter_col2 = cudf::ast::column_reference(2); - auto s_value = cudf::numeric_scalar(1, true); - 
auto lit_value = cudf::ast::literal(s_value); - - auto test_expr = [&](auto& expr) { - // Expected result - auto predicate = cudf::compute_column(written_table, expr); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - // tests - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); - }; - - // row groups min, max: - // table[0] 0-0, 0-0, 1-1, 1-1 - // table[1] 0-0, 1-1, 2-2, 3-3 - // table[2] 0-1, 2-3, 4-5, 6-7 - - // Filtering AST - table[i] == 1 - { - auto expr0 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col0, lit_value); - test_expr(expr0); - - auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col1, lit_value); - test_expr(expr1); - - auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col2, lit_value); - test_expr(expr2); - } - // Filtering AST - table[i] != 1 - { - auto expr0 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col0, lit_value); - test_expr(expr0); - - auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col1, lit_value); - test_expr(expr1); - - auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col2, lit_value); - test_expr(expr2); - } -} - -// Error types - type mismatch, invalid column name, invalid literal type, invalid operator, -// non-bool filter output type. 
-TEST_F(ParquetReaderTest, FilterErrors) -{ - using T = uint32_t; - auto const [src, filepath] = create_parquet_typed_with_stats("FilterErrors.parquet"); - auto const written_table = src.view(); - auto si = cudf::io::source_info(filepath); - - // Filtering AST - invalid column index - { - auto filter_col1 = cudf::ast::column_reference(3); - T constexpr low = 100; - auto lov = cudf::numeric_scalar(low, true); - auto low_lot = cudf::ast::literal(lov); - auto expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); - - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); - EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); - } - - // Filtering AST - invalid column name - { - auto filter_col1 = cudf::ast::column_name_reference("col3"); - T constexpr low = 100; - auto lov = cudf::numeric_scalar(low, true); - auto low_lot = cudf::ast::literal(lov); - auto expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); - EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); - } - - // Filtering AST - incompatible literal type - { - auto filter_col1 = cudf::ast::column_name_reference("col0"); - auto filter_col2 = cudf::ast::column_reference(1); - int64_t constexpr low = 100; - auto lov = cudf::numeric_scalar(low, true); - auto low_lot = cudf::ast::literal(lov); - auto expr1 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); - auto expr2 = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col2, low_lot); - auto builder1 = cudf::io::parquet_reader_options::builder(si).filter(expr1); - EXPECT_THROW(cudf::io::read_parquet(builder1), cudf::logic_error); - - auto builder2 = cudf::io::parquet_reader_options::builder(si).filter(expr2); - EXPECT_THROW(cudf::io::read_parquet(builder2), cudf::logic_error); - } - - // Filtering AST - "table[0] + 110" is invalid filter expression - { - 
auto filter_col1 = cudf::ast::column_reference(0); - T constexpr add_value = 110; - auto add_v = cudf::numeric_scalar(add_value, true); - auto add_lit = cudf::ast::literal(add_v); - auto expr_8 = cudf::ast::operation(cudf::ast::ast_operator::ADD, filter_col1, add_lit); - - auto si = cudf::io::source_info(filepath); - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr_8); - EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); - - // Expected result throw to show that the filter expression is invalid, - // not a limitation of the parquet predicate pushdown. - auto predicate = cudf::compute_column(written_table, expr_8); - EXPECT_THROW(cudf::apply_boolean_mask(written_table, *predicate), cudf::logic_error); - } - - // Filtering AST - INT64(table[0] < 100) non-bool expression - { - auto filter_col1 = cudf::ast::column_reference(0); - T constexpr low = 100; - auto lov = cudf::numeric_scalar(low, true); - auto low_lot = cudf::ast::literal(lov); - auto bool_expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, filter_col1, low_lot); - auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, bool_expr); - - auto builder = cudf::io::parquet_reader_options::builder(si).filter(cast); - EXPECT_THROW(cudf::io::read_parquet(builder), cudf::logic_error); - EXPECT_NO_THROW(cudf::compute_column(written_table, cast)); - auto predicate = cudf::compute_column(written_table, cast); - EXPECT_NE(predicate->view().type().id(), cudf::type_id::BOOL8); - } -} - -// Filter without stats information in file. 
-TEST_F(ParquetReaderTest, FilterNoStats) -{ - using T = uint32_t; - constexpr auto num_rows = 16000; - auto elements = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i / 1000; }); - auto col0 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - auto const written_table = table_view{{col0}}; - auto const filepath = temp_env->get_temp_filepath("FilterNoStats.parquet"); - { - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) - .row_group_size_rows(8000) - .stats_level(cudf::io::statistics_freq::STATISTICS_NONE); - cudf::io::write_parquet(out_opts); - } - auto si = cudf::io::source_info(filepath); - auto filter_col0 = cudf::ast::column_reference(0); - auto s_value = cudf::numeric_scalar(1, true); - auto lit_value = cudf::ast::literal(s_value); - - // row groups min, max: - // table[0] 0-0, 1-1, 2-2, 3-3 - // Filtering AST - table[0] > 1 - auto expr = cudf::ast::operation(cudf::ast::ast_operator::GREATER, filter_col0, lit_value); - - // Expected result - auto predicate = cudf::compute_column(written_table, expr); - auto expected = cudf::apply_boolean_mask(written_table, *predicate); - - // tests - auto builder = cudf::io::parquet_reader_options::builder(si).filter(expr); - auto table_with_metadata = cudf::io::read_parquet(builder); - auto result = table_with_metadata.tbl->view(); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result); -} - -// Filter for float column with NaN values -TEST_F(ParquetReaderTest, FilterFloatNAN) -{ - constexpr auto num_rows = 24000; - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [num_rows](auto i) { return i > num_rows / 2 ? 
NAN : i; }); - auto col0 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - auto col1 = cudf::test::fixed_width_column_wrapper(elements, elements + num_rows); - - auto const written_table = table_view{{col0, col1}}; - auto const filepath = temp_env->get_temp_filepath("FilterFloatNAN.parquet"); - { - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, written_table) - .row_group_size_rows(8000); - cudf::io::write_parquet(out_opts); - } - auto si = cudf::io::source_info(filepath); - auto filter_col0 = cudf::ast::column_reference(0); - auto filter_col1 = cudf::ast::column_reference(1); - auto s0_value = cudf::numeric_scalar(NAN, true); - auto lit0_value = cudf::ast::literal(s0_value); - auto s1_value = cudf::numeric_scalar(NAN, true); - auto lit1_value = cudf::ast::literal(s1_value); - - // row groups min, max: - // table[0] 0-0, 1-1, 2-2, 3-3 - // Filtering AST - table[0] == NAN, table[1] != NAN - auto expr_eq = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, filter_col0, lit0_value); - auto expr_neq = cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, filter_col1, lit1_value); - - // Expected result - auto predicate0 = cudf::compute_column(written_table, expr_eq); - auto expected0 = cudf::apply_boolean_mask(written_table, *predicate0); - auto predicate1 = cudf::compute_column(written_table, expr_neq); - auto expected1 = cudf::apply_boolean_mask(written_table, *predicate1); - - // tests - auto builder0 = cudf::io::parquet_reader_options::builder(si).filter(expr_eq); - auto table_with_metadata0 = cudf::io::read_parquet(builder0); - auto result0 = table_with_metadata0.tbl->view(); - auto builder1 = cudf::io::parquet_reader_options::builder(si).filter(expr_neq); - auto table_with_metadata1 = cudf::io::read_parquet(builder1); - auto result1 = table_with_metadata1.tbl->view(); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected0->view(), result0); - 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected1->view(), result1); -} - -TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) -{ - using namespace cuda::std::chrono; - using namespace cudf::io; - - column_wrapper big_ts_col{ - sys_days{year{3023} / month{7} / day{14}} + 7h + 38min + 45s + 418688us, - sys_days{year{723} / month{3} / day{21}} + 14h + 20min + 13s + microseconds{781ms}}; - - table_view expected({big_ts_col}); - auto filepath = temp_env->get_temp_filepath("BigINT96Timestamp.parquet"); - - auto const out_opts = - parquet_writer_options::builder(sink_info{filepath}, expected).int96_timestamps(true).build(); - write_parquet(out_opts); - - auto const in_opts = parquet_reader_options::builder(source_info(filepath)) - .timestamp_type(cudf::data_type(cudf::type_id::TIMESTAMP_MICROSECONDS)) - .build(); - auto const result = read_parquet(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TEST_F(ParquetWriterTest, PreserveNullability) -{ - constexpr auto num_rows = 100; - - auto const col0_data = random_values(num_rows); - auto const col1_data = random_values(num_rows); - - auto const col0_validity = cudf::test::iterators::no_nulls(); - auto const col1_validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - - column_wrapper col0{col0_data.begin(), col0_data.end(), col0_validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), col1_validity}; - auto const col2 = make_parquet_list_list_col(0, num_rows, 5, 8, true); - - auto const expected = table_view{{col0, col1, *col2}}; - - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("mandatory"); - expected_metadata.column_metadata[0].set_nullability(false); - expected_metadata.column_metadata[1].set_name("optional"); - expected_metadata.column_metadata[1].set_nullability(true); - expected_metadata.column_metadata[2].set_name("lists"); - 
expected_metadata.column_metadata[2].set_nullability(true); - // offsets is a cudf thing that's not part of the parquet schema so it won't have nullability set - expected_metadata.column_metadata[2].child(0).set_name("offsets"); - expected_metadata.column_metadata[2].child(1).set_name("element"); - expected_metadata.column_metadata[2].child(1).set_nullability(false); - expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets"); - expected_metadata.column_metadata[2].child(1).child(1).set_name("element"); - expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true); - - auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .metadata(expected_metadata); - - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options const in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(in_opts); - auto const read_metadata = cudf::io::table_input_metadata{result.metadata}; - - // test that expected_metadata matches read_metadata - std::function - compare_names_and_nullability = [&](auto lhs, auto rhs) { - EXPECT_EQ(lhs.get_name(), rhs.get_name()); - ASSERT_EQ(lhs.is_nullability_defined(), rhs.is_nullability_defined()); - if (lhs.is_nullability_defined()) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); } - ASSERT_EQ(lhs.num_children(), rhs.num_children()); - for (int i = 0; i < lhs.num_children(); ++i) { - compare_names_and_nullability(lhs.child(i), rhs.child(i)); - } - }; - - ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size()); - - for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) { - compare_names_and_nullability(expected_metadata.column_metadata[i], - read_metadata.column_metadata[i]); - } -} - -TEST_P(ParquetV2Test, CheckEncodings) -{ - 
using cudf::io::parquet::detail::Encoding; - constexpr auto num_rows = 100'000; - auto const is_v2 = GetParam(); - - auto const validity = cudf::test::iterators::no_nulls(); - // data should be PLAIN for v1, RLE for V2 - auto col0_data = - cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 - auto col1_data = random_values(num_rows); - // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 - auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); - - cudf::test::fixed_width_column_wrapper col0{col0_data, col0_data + num_rows, validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data, col2_data + num_rows, validity}; - - auto expected = table_view{{col0, col1, col2}}; - - auto const filename = is_v2 ? "CheckEncodingsV2.parquet" : "CheckEncodingsV1.parquet"; - auto filepath = temp_env->get_temp_filepath(filename); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .max_page_size_rows(num_rows) - .write_v2_headers(is_v2); - cudf::io::write_parquet(out_opts); - - // make sure the expected encodings are present - auto contains = [](auto const& vec, auto const& enc) { - return std::find(vec.begin(), vec.end(), enc) != vec.end(); - }; - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; - auto const& chunk1_enc = fmd.row_groups[0].columns[1].meta_data.encodings; - auto const& chunk2_enc = fmd.row_groups[0].columns[2].meta_data.encodings; - if (is_v2) { - // col0 should have RLE for rep/def and data - EXPECT_TRUE(chunk0_enc.size() == 1); - EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for 
rep/def and DELTA_BINARY_PACKED for data - EXPECT_TRUE(chunk1_enc.size() == 2); - EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); - // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data - EXPECT_TRUE(chunk2_enc.size() == 3); - EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN)); - EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE_DICTIONARY)); - } else { - // col0 should have RLE for rep/def and PLAIN for data - EXPECT_TRUE(chunk0_enc.size() == 2); - EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk0_enc, Encoding::PLAIN)); - // col1 should have RLE for rep/def and PLAIN for data - EXPECT_TRUE(chunk1_enc.size() == 2); - EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); - // col2 should have RLE for rep/def and PLAIN_DICTIONARY for data and dict - EXPECT_TRUE(chunk2_enc.size() == 2); - EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN_DICTIONARY)); - } -} - -// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. -// see definition of UnsupportedChronoTypes above. 
-using DeltaDecimalTypes = cudf::test::Types; -using DeltaBinaryTypes = - cudf::test::Concat; -using SupportedDeltaTestTypes = - cudf::test::RemoveIf, DeltaBinaryTypes>; -TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); - -TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) -{ - using T = TypeParam; - auto col0 = testdata::ascending(); - auto col1 = testdata::unordered(); - - auto const expected = table_view{{col0, col1}}; - - auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .write_v2_headers(true) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); -} - -TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) -{ - using T = TypeParam; - constexpr int num_rows = 4'000; - auto col0 = testdata::ascending(); - auto col1 = testdata::unordered(); - - auto const expected = table_view{{col0, col1}}; - auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); - ASSERT_EQ(expected_slice[0].num_rows(), num_rows); - - auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) - .write_v2_headers(true) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, 
result.tbl->view()); -} - -TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) -{ - using T = TypeParam; - - constexpr int num_slice = 4'000; - constexpr int num_rows = 32 * 1024; - - std::mt19937 gen(6542); - std::bernoulli_distribution bn(0.7f); - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); - auto values = thrust::make_counting_iterator(0); - - // list - constexpr int vals_per_row = 4; - auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); - cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, - c1_offset_iter + num_rows + 1); - cudf::test::fixed_width_column_wrapper c1_vals( - values, values + (num_rows * vals_per_row), valids); - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); - - auto _c1 = cudf::make_lists_column( - num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); - auto c1 = cudf::purge_nonempty_nulls(*_c1); - - auto const expected = table_view{{*c1}}; - auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); - ASSERT_EQ(expected_slice[0].num_rows(), num_slice); - - auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) - .write_v2_headers(true) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto result = cudf::io::read_parquet(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); -} - -TEST_F(ParquetWriterTest, EmptyMinStringStatistics) -{ - char const* const min_val = ""; - char const* const max_val = "zzz"; - std::vector 
strings{min_val, max_val, "pining", "for", "the", "fjords"}; - - column_wrapper string_col{strings.begin(), strings.end()}; - auto const output = table_view{{string_col}}; - auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - read_footer(source, &fmd); - - ASSERT_TRUE(fmd.row_groups.size() > 0); - ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); - auto const& chunk = fmd.row_groups[0].columns[0]; - auto const stats = get_statistics(chunk); - - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), - stats.min_value.value().size()}; - auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), - stats.max_value.value().size()}; - EXPECT_EQ(min_value, std::string(min_val)); - EXPECT_EQ(max_value, std::string(max_val)); -} - -TEST_F(ParquetReaderTest, RepeatedNoAnnotations) -{ - constexpr unsigned char repeated_bytes[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, - 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, - 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, - 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, - 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, - 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, - 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, - 0x00, 0x00, 0x00, 
0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, - 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, - 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, - 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, - 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, - 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, - 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, - 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, - 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, - 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, - 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, - 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, - 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, - 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, - 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, - 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, - 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, - 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, - 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, - 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 
0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, - 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, - 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, - 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, - 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, - 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, - 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, - 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, - 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, - 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, - 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, - 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, - 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, - 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, - 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - auto read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); - auto result = cudf::io::read_parquet(read_opts); - - EXPECT_EQ(result.tbl->view().column(0).size(), 6); - EXPECT_EQ(result.tbl->view().num_columns(), 2); - - column_wrapper col0{1, 2, 3, 4, 5, 6}; - column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; - cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; - auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; - - auto list_offsets_column = 
- cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); - auto num_list_rows = list_offsets_column->size() - 1; - - auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); - cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); - - auto list_col = cudf::make_lists_column( - num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, std::move(mask)); - - std::vector> struct_children; - struct_children.push_back(std::move(list_col)); - - auto outer_struct = - cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; - table_view expected{{col0, outer_struct}}; - - CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); -} - -inline auto random_validity(std::mt19937& engine) -{ - static std::bernoulli_distribution bn(0.7f); - return cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(engine); }); -} - -template -std::unique_ptr make_parquet_list_col(std::mt19937& engine, - int num_rows, - int max_vals_per_row, - bool include_validity) -{ - std::vector row_sizes(num_rows); - - auto const min_values_per_row = include_validity ? 
0 : 1; - std::uniform_int_distribution dist{min_values_per_row, max_vals_per_row}; - std::generate_n(row_sizes.begin(), num_rows, [&]() { return cudf::size_type{dist(engine)}; }); - - std::vector offsets(num_rows + 1); - std::exclusive_scan(row_sizes.begin(), row_sizes.end(), offsets.begin(), 0); - offsets[num_rows] = offsets[num_rows - 1] + row_sizes.back(); - - std::vector values = random_values(offsets[num_rows]); - cudf::test::fixed_width_column_wrapper offsets_col(offsets.begin(), - offsets.end()); - - if (include_validity) { - auto valids = random_validity(engine); - auto values_col = - cudf::test::fixed_width_column_wrapper(values.begin(), values.end(), valids); - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); - - auto col = cudf::make_lists_column( - num_rows, offsets_col.release(), values_col.release(), null_count, std::move(null_mask)); - return cudf::purge_nonempty_nulls(*col); - } else { - auto values_col = cudf::test::fixed_width_column_wrapper(values.begin(), values.end()); - return cudf::make_lists_column(num_rows, - offsets_col.release(), - values_col.release(), - 0, - cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID)); - } -} - -std::vector string_values(std::mt19937& engine, int num_rows, int max_string_len) -{ - static std::uniform_int_distribution char_dist{'a', 'z'}; - static std::uniform_int_distribution strlen_dist{1, max_string_len}; - - std::vector values(num_rows); - std::generate_n(values.begin(), values.size(), [&]() { - int str_len = strlen_dist(engine); - std::string res = ""; - for (int i = 0; i < str_len; i++) { - res += char_dist(engine); - } - return res; - }); - - return values; -} - -// make a random list column, with random string lengths of 0..max_string_len, -// and up to max_vals_per_row strings in each list. 
-std::unique_ptr make_parquet_string_list_col(std::mt19937& engine, - int num_rows, - int max_vals_per_row, - int max_string_len, - bool include_validity) -{ - auto const range_min = include_validity ? 0 : 1; - - std::uniform_int_distribution dist{range_min, max_vals_per_row}; - - std::vector row_sizes(num_rows); - std::generate_n(row_sizes.begin(), num_rows, [&]() { return cudf::size_type{dist(engine)}; }); - - std::vector offsets(num_rows + 1); - std::exclusive_scan(row_sizes.begin(), row_sizes.end(), offsets.begin(), 0); - offsets[num_rows] = offsets[num_rows - 1] + row_sizes.back(); - - std::uniform_int_distribution strlen_dist{range_min, max_string_len}; - auto const values = string_values(engine, offsets[num_rows], max_string_len); - - cudf::test::fixed_width_column_wrapper offsets_col(offsets.begin(), - offsets.end()); - - if (include_validity) { - auto valids = random_validity(engine); - auto values_col = cudf::test::strings_column_wrapper(values.begin(), values.end(), valids); - auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); - - auto col = cudf::make_lists_column( - num_rows, offsets_col.release(), values_col.release(), null_count, std::move(null_mask)); - return cudf::purge_nonempty_nulls(*col); - } else { - auto values_col = cudf::test::strings_column_wrapper(values.begin(), values.end()); - return cudf::make_lists_column(num_rows, - offsets_col.release(), - values_col.release(), - 0, - cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID)); - } -} - -TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) -{ - constexpr int num_rows = 50'000; - constexpr auto seed = 21337; - - std::mt19937 engine{seed}; - auto int32_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); - auto int32_list = make_parquet_list_col(engine, num_rows, 5, false); - auto int64_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); - auto int64_list = make_parquet_list_col(engine, num_rows, 5, false); - auto 
int16_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); - auto int16_list = make_parquet_list_col(engine, num_rows, 5, false); - auto int8_list_nulls = make_parquet_list_col(engine, num_rows, 5, true); - auto int8_list = make_parquet_list_col(engine, num_rows, 5, false); - - auto str_list_nulls = make_parquet_string_list_col(engine, num_rows, 5, 32, true); - auto str_list = make_parquet_string_list_col(engine, num_rows, 5, 32, false); - auto big_str_list_nulls = make_parquet_string_list_col(engine, num_rows, 5, 256, true); - auto big_str_list = make_parquet_string_list_col(engine, num_rows, 5, 256, false); - - auto int32_data = random_values(num_rows); - auto int64_data = random_values(num_rows); - auto int16_data = random_values(num_rows); - auto int8_data = random_values(num_rows); - auto str_data = string_values(engine, num_rows, 32); - auto big_str_data = string_values(engine, num_rows, 256); - - auto const validity = random_validity(engine); - auto const no_nulls = cudf::test::iterators::no_nulls(); - column_wrapper int32_nulls_col{int32_data.begin(), int32_data.end(), validity}; - column_wrapper int32_col{int32_data.begin(), int32_data.end(), no_nulls}; - column_wrapper int64_nulls_col{int64_data.begin(), int64_data.end(), validity}; - column_wrapper int64_col{int64_data.begin(), int64_data.end(), no_nulls}; - - auto str_col = cudf::test::strings_column_wrapper(str_data.begin(), str_data.end(), no_nulls); - auto str_col_nulls = cudf::purge_nonempty_nulls( - cudf::test::strings_column_wrapper(str_data.begin(), str_data.end(), validity)); - auto big_str_col = - cudf::test::strings_column_wrapper(big_str_data.begin(), big_str_data.end(), no_nulls); - auto big_str_col_nulls = cudf::purge_nonempty_nulls( - cudf::test::strings_column_wrapper(big_str_data.begin(), big_str_data.end(), validity)); - - cudf::table_view tbl({int32_col, int32_nulls_col, *int32_list, *int32_list_nulls, - int64_col, int64_nulls_col, *int64_list, *int64_list_nulls, - 
*int16_list, *int16_list_nulls, *int8_list, *int8_list_nulls, - str_col, *str_col_nulls, *str_list, *str_list_nulls, - big_str_col, *big_str_col_nulls, *big_str_list, *big_str_list_nulls}); - - auto const filepath = temp_env->get_temp_filepath("DeltaSkipRowsWithNulls.parquet"); - auto const out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) - .write_v2_headers(true) - .build(); - cudf::io::write_parquet(out_opts); - - // skip_rows / num_rows - // clang-format off - std::vector> params{ - // skip and then read rest of file - {-1, -1}, {1, -1}, {2, -1}, {32, -1}, {33, -1}, {128, -1}, {1000, -1}, - // no skip but read fewer rows - {0, 1}, {0, 2}, {0, 31}, {0, 32}, {0, 33}, {0, 128}, {0, 129}, {0, 130}, - // skip and truncate - {1, 32}, {1, 33}, {32, 32}, {33, 139}, - // cross page boundaries - {10'000, 20'000} - }; - - // clang-format on - for (auto p : params) { - cudf::io::parquet_reader_options read_args = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - if (p.first >= 0) { read_args.set_skip_rows(p.first); } - if (p.second >= 0) { read_args.set_num_rows(p.second); } - auto result = cudf::io::read_parquet(read_args); - - p.first = p.first < 0 ? 0 : p.first; - p.second = p.second < 0 ? 
num_rows - p.first : p.second; - std::vector slice_indices{p.first, p.first + p.second}; - std::vector expected = cudf::slice(tbl, slice_indices); - - CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected[0]); - - // test writing the result back out as a further check of the delta writer's correctness - std::vector out_buffer; - cudf::io::parquet_writer_options out_opts2 = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, - result.tbl->view()) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) - .write_v2_headers(true); - cudf::io::write_parquet(out_opts2); - - cudf::io::parquet_reader_options default_in_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{out_buffer.data(), out_buffer.size()}); - auto const result2 = cudf::io::read_parquet(default_in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), result2.tbl->view()); - } -} +// NOTE: this file exists to define the parquet test's `main()` function. +// `main()` is kept in its own compilation unit to keep the compilation time for +// PARQUET_TEST at a minimum. +// +// Do not add any test definitions to this file. CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp new file mode 100644 index 00000000000..f2b50639a4d --- /dev/null +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -0,0 +1,1528 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "parquet_common.hpp" + +#include +#include +#include +#include + +#include + +// Base test fixture for V2 header tests +class ParquetV2Test : public ::cudf::test::BaseFixtureWithParam {}; + +INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, + ParquetV2Test, + testing::Bool(), + testing::PrintToStringParamName()); + +TEST_P(ParquetV2Test, MultiColumn) +{ + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + + // auto col0_data = random_values(num_rows); + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col8_vals = random_values(num_rows); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; + }); + auto col8_data = cudf::detail::make_counting_transform_iterator(0, [col8_vals](auto i) { + return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; + }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + // column_wrapper col0{ + // col0_data.begin(), col0_data.end(), validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), 
validity}; + column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; + column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; + column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; + column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; + column_wrapper col6{col6_data, col6_data + num_rows, validity}; + column_wrapper col7{col7_data, col7_data + num_rows, validity}; + column_wrapper col8{col8_data, col8_data + num_rows, validity}; + + auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; + + cudf::io::table_input_metadata expected_metadata(expected); + // expected_metadata.column_metadata[0].set_name( "bools"); + expected_metadata.column_metadata[0].set_name("int8s"); + expected_metadata.column_metadata[1].set_name("int16s"); + expected_metadata.column_metadata[2].set_name("int32s"); + expected_metadata.column_metadata[3].set_name("floats"); + expected_metadata.column_metadata[4].set_name("doubles"); + expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(10); + expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20); + expected_metadata.column_metadata[7].set_name("decimal128s").set_decimal_precision(40); + + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, MultiColumnWithNulls) +{ + constexpr auto num_rows = 100; + auto const is_v2 = GetParam(); + + // auto 
col0_data = random_values(num_rows); + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_vals = random_values(num_rows); + auto col7_vals = random_values(num_rows); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; + }); + // auto col0_mask = cudf::detail::make_counting_transform_iterator( + // 0, [](auto i) { return (i % 2); }); + auto col1_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); + auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto col3_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); + auto col4_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); + auto col5_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + auto col6_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 5); }); + auto col7_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i != 55); }); + + // column_wrapper col0{ + // col0_data.begin(), col0_data.end(), col0_mask}; + column_wrapper col1{col1_data.begin(), col1_data.end(), col1_mask}; + column_wrapper col2{col2_data.begin(), col2_data.end(), col2_mask}; + column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; + column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; + column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; + column_wrapper 
col6{col6_data, col6_data + num_rows, col6_mask}; + column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; + + auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}}; + + cudf::io::table_input_metadata expected_metadata(expected); + // expected_metadata.column_names.emplace_back("bools"); + expected_metadata.column_metadata[0].set_name("int8s"); + expected_metadata.column_metadata[1].set_name("int16s"); + expected_metadata.column_metadata[2].set_name("int32s"); + expected_metadata.column_metadata[3].set_name("floats"); + expected_metadata.column_metadata[4].set_name("doubles"); + expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(9); + expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20); + + auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + // TODO: Need to be able to return metadata in tree form from reader so they can be compared. + // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not + // have any tests for it c++ or python. 
+ cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, Strings) +{ + auto const is_v2 = GetParam(); + + std::vector strings{ + "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + auto const num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + + auto expected = table_view{{col0, col1, col2}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("Strings.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, StringsAsBinary) +{ + auto const is_v2 = GetParam(); + std::vector unicode_strings{ + "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + std::vector ascii_strings{ + "Monday", "Wednesday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + + column_wrapper col0{ascii_strings.begin(), ascii_strings.end()}; + column_wrapper 
col1{unicode_strings.begin(), unicode_strings.end()}; + column_wrapper col2{ascii_strings.begin(), ascii_strings.end()}; + cudf::test::lists_column_wrapper col3{{'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 'e', 'd', 'n', 'e', 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}; + cudf::test::lists_column_wrapper col4{ + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'W', 200, 133, 'd', 'n', 200, 133, 's', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'M', 'o', 'n', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'r', 'i', 'd', 'a', 'y'}, + {'F', 'u', 'n', 'd', 'a', 'y'}}; + + auto write_tbl = table_view{{col0, col1, col2, col3, col4}}; + + cudf::io::table_input_metadata expected_metadata(write_tbl); + expected_metadata.column_metadata[0].set_name("col_single").set_output_as_binary(true); + expected_metadata.column_metadata[1].set_name("col_string").set_output_as_binary(true); + expected_metadata.column_metadata[2].set_name("col_another").set_output_as_binary(true); + expected_metadata.column_metadata[3].set_name("col_binary"); + expected_metadata.column_metadata[4].set_name("col_binary2"); + + auto filepath = temp_env->get_temp_filepath("BinaryStrings.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, write_tbl) + .write_v2_headers(is_v2) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .metadata(expected_metadata); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema( + {cudf::io::reader_column_schema().set_convert_binary_to_strings(false), + cudf::io::reader_column_schema().set_convert_binary_to_strings(false), + 
cudf::io::reader_column_schema().set_convert_binary_to_strings(false), + cudf::io::reader_column_schema().add_child(cudf::io::reader_column_schema()), + cudf::io::reader_column_schema().add_child(cudf::io::reader_column_schema())}); + auto result = cudf::io::read_parquet(in_opts); + auto expected = table_view{{col3, col4, col3, col3, col4}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, SlicedTable) +{ + // This test checks for writing zero copy, offsetted views into existing cudf tables + + std::vector strings{ + "Monday", "Wȅdnȅsday", "Friday", "Monday", "Friday", "Friday", "Friday", "Funday"}; + auto const num_rows = strings.size(); + auto const is_v2 = GetParam(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + + using lcw = cudf::test::lists_column_wrapper; + lcw col3{{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; + + // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // [NULL, [[13],[14,15,16]], NULL] + // [NULL, [], NULL, [[]]] + // NULL + // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // [NULL, [[13],[14,15,16]], NULL] + // [[[]]] + // [NULL, [], NULL, [[]]] + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + lcw col4{{ + {{{{1, 2, 3, 4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, + lcw{lcw{lcw{}}}, + {{{{1, 2, 3, 
4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, + lcw{lcw{lcw{}}}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, + }, + valids2}; + + // Struct column + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}}; + + auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}}; + + // Struct/List mixed column + + // [] + // [NULL, 2, NULL] + // [4, 5] + // NULL + // [] + // [7, 8, 9] + // [10] + // [11, 12] + lcw land{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}, {10}, {11, 12}}, valids2}; + + // [] + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8], []] + // [[]] + // [[]] + // [[], [], []] + // [[10]] + // [[13, 14], [15]] + lcw flats{lcw{}, + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, + {{7, 8}, {}}, + lcw{lcw{}}, + lcw{lcw{}}, + lcw{lcw{}, lcw{}, lcw{}}, + {lcw{10}}, + {{13, 14}, {15}}}; + + auto struct_1 = cudf::test::structs_column_wrapper{land, flats}; + auto is_human = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, true, false, true, false}}; + auto col6 = cudf::test::structs_column_wrapper{{is_human, struct_1}}; + + auto expected = table_view({col0, col1, col2, col3, col4, col5, col6}); + + // auto expected_slice = expected; + auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows) - 1}); + + cudf::io::table_input_metadata expected_metadata(expected_slice); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + expected_metadata.column_metadata[3].set_name("col_list"); + expected_metadata.column_metadata[4].set_name("col_multi_level_list"); + expected_metadata.column_metadata[5].set_name("col_struct"); + expected_metadata.column_metadata[5].set_name("col_struct_list"); + 
expected_metadata.column_metadata[6].child(0).set_name("human?"); + expected_metadata.column_metadata[6].child(1).set_name("particulars"); + expected_metadata.column_metadata[6].child(1).child(0).set_name("land"); + expected_metadata.column_metadata[6].child(1).child(1).set_name("flats"); + + auto filepath = temp_env->get_temp_filepath("SlicedTable.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_slice, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, ListColumn) +{ + auto const is_v2 = GetParam(); + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + lcw col0{{{{1, 2, 3}, valids}, {}, {4, 5}, {}}, valids2}; + + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, valids2}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // [[]] + using dlcw = cudf::test::lists_column_wrapper; + dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, valids}}, valids2}, + {{7., 8.}}, + dlcw{}, + dlcw{dlcw{}}}; + + // TODO: uint16_t lists are not read properly in 
parquet reader + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // NULL + // using ui16lcw = cudf::test::lists_column_wrapper; + // cudf::test::lists_column_wrapper col4{ + // {{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, valids2}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, + // valids2}; + + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // NULL + lcw col5{ + {{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, valids}}, valids2}, {{7, 8}}, lcw{}, lcw{lcw{}}}, + valids2}; + + using strlcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col6{ + {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, + {{"bee", "sting"}}, + strlcw{}, + strlcw{strlcw{}}}; + + // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // [NULL, [[13],[14,15,16]], NULL] + // [NULL, [], NULL, [[]]] + // NULL + lcw col7{{ + {{{{1, 2, 3, 4}, valids}}, {{{5, 6, 7}, valids}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, valids}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, valids}, + lcw{lcw{lcw{}}}, + }, + valids2}; + + table_view expected({col0, col1, col2, col3, /* col4, */ col5, col6, col7}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_list_int_0"); + expected_metadata.column_metadata[1].set_name("col_list_list_int_1"); + expected_metadata.column_metadata[2].set_name("col_list_list_int_nullable_2"); + expected_metadata.column_metadata[3].set_name("col_list_list_nullable_double_nullable_3"); + // expected_metadata.column_metadata[0].set_name("col_list_list_uint16_4"); + expected_metadata.column_metadata[4].set_name("col_list_nullable_list_nullable_int_nullable_5"); + expected_metadata.column_metadata[5].set_name("col_list_list_string_6"); + expected_metadata.column_metadata[6].set_name("col_list_list_list_7"); + + auto filepath = temp_env->get_temp_filepath("ListColumn.parquet"); + auto out_opts = 
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata) + .compression(cudf::io::compression_type::NONE); + + cudf::io::write_parquet(out_opts); + + auto in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, StructOfList) +{ + auto const is_v2 = GetParam(); + + // Struct>, + // flats:List> + // > + // > + + auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; }); + + using lcw = cudf::test::lists_column_wrapper; + + // [] + // [NULL, 2, NULL] + // [4, 5] + // NULL + // [] + // [7, 8, 9] + lcw land_unit{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}}, valids2}; + + // [] + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8], []] + // [[]] + // [[]] + // [[], [], []] + lcw flats{lcw{}, + {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, + {{7, 8}, {}}, + lcw{lcw{}}, + lcw{lcw{}}, + lcw{lcw{}, lcw{}, lcw{}}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col, land_unit, flats}, + {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto expected = table_view({*struct_2}); + + cudf::io::table_input_metadata expected_metadata(expected); + 
expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("age"); + expected_metadata.column_metadata[0].child(1).child(2).set_name("land_unit"); + expected_metadata.column_metadata[0].child(1).child(3).set_name("flats"); + + auto filepath = temp_env->get_temp_filepath("StructOfList.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); + auto const result = cudf::io::read_parquet(read_args); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, ListOfStruct) +{ + auto const is_v2 = GetParam(); + + // List + // > + // > + + auto weight_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 2, 5, 5, 6}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto list_col = cudf::make_lists_column( + num_list_rows, 
std::move(list_offsets_column), std::move(struct_2), 0, {}); + + auto expected = table_view({*list_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("family"); + expected_metadata.column_metadata[0].child(1).child(0).set_name("human?"); + expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars"); + expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("ListOfStruct.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(is_v2) + .metadata(expected_metadata); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); + auto const result = cudf::io::read_parquet(read_args); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_P(ParquetV2Test, PartitionedWriteEmptyPartitions) +{ + auto const is_v2 = GetParam(); + + auto source = create_random_fixed_table(4, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder( + cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, 
partition2}) + .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(args); + + auto result1 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns) +{ + auto const is_v2 = GetParam(); + + auto source = create_random_fixed_table(0, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder( + cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(args); + + auto result1 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_P(ParquetV2Test, LargeColumnIndex) +{ + // create a file large enough to be written in 2 batches (currently 1GB per batch) + // pick fragment size that num_rows is 
divisible by, so we'll get equal sized row groups + const std::string s1(1000, 'a'); + const std::string s2(1000, 'b'); + constexpr auto num_rows = 512 * 1024; + constexpr auto frag_size = num_rows / 128; + auto const is_v2 = GetParam(); + + auto col0_elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return (i < num_rows) ? s1 : s2; }); + auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + 2 * num_rows); + + auto const expected = table_view{{col0, col0}}; + + auto const filepath = temp_env->get_temp_filepath("LargeColumnIndex.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::NEVER) + .write_v2_headers(is_v2) + .max_page_fragment_size(frag_size) + .row_group_size_bytes(1024 * 1024 * 1024) + .row_group_size_rows(num_rows); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (auto const& rg : fmd.row_groups) { + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); + } + } +} + +TEST_P(ParquetV2Test, CheckColumnOffsetIndex) +{ + constexpr auto num_rows = 100000; + 
auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + // fixed length strings + auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%012d", i); + return std::string(buf); + }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_data = random_values(num_rows); + + auto col1 = cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end()); + auto col2 = cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end()); + auto col3 = cudf::test::fixed_width_column_wrapper(col3_data.begin(), col3_data.end()); + auto col4 = cudf::test::fixed_width_column_wrapper(col4_data.begin(), col4_data.end()); + auto col5 = cudf::test::fixed_width_column_wrapper(col5_data.begin(), col5_data.end()); + auto col6 = cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end()); + + // mixed length strings + auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%d", i); + return std::string(buf); + }); + auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); + + auto const expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndex.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(20000); + 
cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; + } + + // loop over page stats from the column index. check that stats.min <= page.min + // and stats.max >= page.max for each page. 
+ auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(ci.null_counts.has_value()); + + // schema indexing starts at 1 + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + for (size_t p = 0; p < ci.min_values.size(); p++) { + // null_pages should always be false + EXPECT_FALSE(ci.null_pages[p]); + // null_counts should always be 0 + EXPECT_EQ(ci.null_counts.value()[p], 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); + } + for (size_t p = 0; p < ci.max_values.size(); p++) + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); + } + } +} + +TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) +{ + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + // fixed length strings + auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%012d", i); + return std::string(buf); + }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto col5_data = random_values(num_rows); + auto col6_data = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + // add null values for all but first column + auto col1 = + cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), valids); + auto col2 = + cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), valids); + 
auto col3 = + cudf::test::fixed_width_column_wrapper(col3_data.begin(), col3_data.end(), valids); + auto col4 = + cudf::test::fixed_width_column_wrapper(col4_data.begin(), col4_data.end(), valids); + auto col5 = + cudf::test::fixed_width_column_wrapper(col5_data.begin(), col5_data.end(), valids); + auto col6 = + cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end(), valids); + + // mixed length strings + auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%d", i); + return std::string(buf); + }); + auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); + + auto expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexNulls.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(20000); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + num_vals += is_v2 ? 
ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; + } + + // loop over page stats from the column index. check that stats.min <= page.min + // and stats.max >= page.max for each page. + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + // should be half nulls, except no nulls in column 0 + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(ci.null_counts.has_value()); + + // schema indexing starts at 1 + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + for (size_t p = 0; p < ci.min_values.size(); p++) { + EXPECT_FALSE(ci.null_pages[p]); + if (c > 0) { // first column has no nulls + EXPECT_GT(ci.null_counts.value()[p], 0); + } else { + EXPECT_EQ(ci.null_counts.value()[p], 0); + } + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); + } + for (size_t p = 0; p < ci.max_values.size(); p++) { + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); + } + } + } +} + +TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) +{ + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + // fixed length strings + auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%012d", i); + return std::string(buf); + }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + + // col1 is all nulls + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; }); + auto col1 = + cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), valids); + auto col2 = cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end()); + + // mixed length strings + auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + char buf[30]; + sprintf(buf, "%d", i); + return std::string(buf); + }); + auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); + + auto expected = table_view{{col0, col1, col2, col3}}; + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexNullColumn.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(20000); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = 
read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; + } + + // loop over page stats from the column index. check that stats.min <= page.min + // and stats.max >= page.max for each non-empty page. + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + // there should be no nulls except column 1 which is all nulls + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0); + ASSERT_TRUE(ci.null_counts.has_value()); + + // schema indexing starts at 1 + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + for (size_t p = 0; p < ci.min_values.size(); p++) { + // check tnat null_pages is true for column 1 + if (c == 1) { + EXPECT_TRUE(ci.null_pages[p]); + EXPECT_GT(ci.null_counts.value()[p], 0); + } + if (not ci.null_pages[p]) { + EXPECT_EQ(ci.null_counts.value()[p], 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); + } + } + for (size_t p = 0; p < ci.max_values.size(); p++) { + if (not ci.null_pages[p]) { + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); + } + } + } + } +} + +TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) +{ + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + auto c0 = testdata::ascending(); + + auto sc0 = testdata::ascending(); + auto sc1 = testdata::descending(); + auto sc2 = testdata::unordered(); + + std::vector> struct_children; + struct_children.push_back(sc0.release()); + struct_children.push_back(sc1.release()); + struct_children.push_back(sc2.release()); + cudf::test::structs_column_wrapper c1(std::move(struct_children)); + + auto listgen = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? i / 2 : num_ordered_rows - (i / 2); }); + auto list = + cudf::test::fixed_width_column_wrapper(listgen, listgen + 2 * num_ordered_rows); + auto offgen = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto offsets = + cudf::test::fixed_width_column_wrapper(offgen, offgen + num_ordered_rows + 1); + + auto c2 = cudf::make_lists_column(num_ordered_rows, offsets.release(), list.release(), 0, {}); + + table_view expected({c0, c1, *c2}); + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexStruct.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(page_size_for_ordered_tests); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + // hard coded schema indices. 
+ // TODO find a way to do this without magic + size_t const colidxs[] = {1, 3, 4, 5, 8}; + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + size_t colidx = colidxs[c]; + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + // last column has 2 values per row + num_vals += is_v2 ? ph.data_page_header_v2.num_rows + : ph.data_page_header.num_values / (c == rg.columns.size() - 1 ? 2 : 1); + } + + // loop over page stats from the column index. check that stats.min <= page.min + // and stats.max >= page.max for each page. + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + + auto const ptype = fmd.schema[colidx].type; + auto const ctype = fmd.schema[colidx].converted_type; + for (size_t p = 0; p < ci.min_values.size(); p++) { + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); + } + for (size_t p = 0; p < ci.max_values.size(); p++) { + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); + } + } + } +} + +TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) +{ + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + auto validity2 = + cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); + auto validity3 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 3) != 0; }); + auto validity4 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 4) != 0; }); + auto validity5 = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type i) { return (i % 5) != 0; }); + + auto c0 = testdata::ascending(); + + auto col1_data = random_values(num_ordered_rows); + auto col2_data = random_values(num_ordered_rows); + auto col3_data = random_values(num_ordered_rows); + + // col1 is all nulls + auto col1 = + cudf::test::fixed_width_column_wrapper(col1_data.begin(), col1_data.end(), validity2); + auto col2 = + cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity3); + auto col3 = + cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end(), validity4); + + std::vector> struct_children; + struct_children.push_back(col1.release()); + struct_children.push_back(col2.release()); + struct_children.push_back(col3.release()); + auto struct_validity = std::vector(validity5, validity5 + num_ordered_rows); + cudf::test::structs_column_wrapper c1(std::move(struct_children), struct_validity); + table_view expected({c0, c1}); + + auto const filepath = temp_env->get_temp_filepath("CheckColumnOffsetIndexStructNulls.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .max_page_size_rows(page_size_for_ordered_tests); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData 
fmd; + + read_footer(source, &fmd); + + // all struct columns will have num_ordered_rows / 5 nulls at level 0. + // col1 will have num_ordered_rows / 2 nulls total + // col2 will have num_ordered_rows / 3 nulls total + // col3 will have num_ordered_rows / 4 nulls total + int const null_mods[] = {0, 2, 3, 4}; + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + auto const ci = read_column_index(source, chunk); + + // check definition level histogram (repetition will not be present) + if (c != 0) { + ASSERT_TRUE(chunk.meta_data.size_statistics.has_value()); + ASSERT_TRUE(chunk.meta_data.size_statistics->definition_level_histogram.has_value()); + // there are no lists so there should be no repetition level histogram + EXPECT_FALSE(chunk.meta_data.size_statistics->repetition_level_histogram.has_value()); + auto const& def_hist = chunk.meta_data.size_statistics->definition_level_histogram.value(); + ASSERT_TRUE(def_hist.size() == 3L); + auto const l0_nulls = num_ordered_rows / 5; + auto const l1_l0_nulls = num_ordered_rows / (5 * null_mods[c]); + auto const l1_nulls = num_ordered_rows / null_mods[c] - l1_l0_nulls; + auto const l2_vals = num_ordered_rows - l1_nulls - l0_nulls; + EXPECT_EQ(def_hist[0], l0_nulls); + EXPECT_EQ(def_hist[1], l1_nulls); + EXPECT_EQ(def_hist[2], l2_vals); + } else { + // column 0 has no lists and no nulls and no strings, so there should be no size stats + EXPECT_FALSE(chunk.meta_data.size_statistics.has_value()); + } + + int64_t num_vals = 0; + + if (is_v2) { ASSERT_TRUE(ci.null_counts.has_value()); } + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = 
read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + EXPECT_EQ(page_loc.first_row_index, num_vals); + num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values; + // check that null counts match + if (is_v2) { EXPECT_EQ(ci.null_counts.value()[o], ph.data_page_header_v2.num_nulls); } + } + } + } +} + +TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) +{ + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; + + using cudf::test::iterators::null_at; + using cudf::test::iterators::nulls_at; + using lcw = cudf::test::lists_column_wrapper; + + // 4 nulls + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + // def histogram [1, 1, 2, 3] + // rep histogram [4, 3] + lcw col0{{{{1, 2, 3}, nulls_at({0, 2})}, {}, {4, 5}, {}}, null_at(3)}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + // def histogram [1, 3, 10] + // rep histogram [4, 4, 6] + lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + // def histogram [1, 1, 2, 10] + // rep histogram [4, 4, 6] + lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // [[]] + // def histogram [1, 1, 2, 2, 8] + // rep histogram [4, 4, 6] + using dlcw = cudf::test::lists_column_wrapper; + dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, nulls_at({0, 2})}}, null_at(3)}, + {{7., 8.}}, + dlcw{}, + dlcw{dlcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // NULL + // def histogram [1, 1, 1, 1, 10] + // rep histogram [4, 4, 6] + using ui16lcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col4{ + 
{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, + null_at(3)}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // NULL + // def histogram [1, 1, 1, 1, 2, 8] + // rep histogram [4, 4, 6] + lcw col5{{{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, nulls_at({0, 2})}}, null_at(3)}, + {{7, 8}}, + lcw{}, + lcw{lcw{}}}, + null_at(3)}; + + // 4 nulls + // def histogram [1, 3, 9] + // rep histogram [4, 4, 5] + using strlcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col6{ + {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, + {{"bee", "sting"}}, + strlcw{}, + strlcw{strlcw{}}}; + + // 5 nulls + // def histogram [1, 3, 1, 8] + // rep histogram [4, 4, 5] + using strlcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col7{{{"Monday", "Monday", "Friday"}, + {}, + {{"Monday", "Friday"}, null_at(1)}, + {}, + {"Sunday", "Funday"}}, + {{"bee", "sting"}}, + strlcw{}, + strlcw{strlcw{}}}; + + // 11 nulls + // D 5 6 5 6 5 6 5 6 6 + // R 0 3 3 3 1 3 3 2 3 + // [[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // D 2 6 6 6 6 2 + // R 0 1 2 3 3 1 + // [NULL, [[13],[14,15,16]], NULL] + // D 2 3 2 4 + // R 0 1 1 1 + // [NULL, [], NULL, [[]]] + // D 0 + // R 0 + // NULL + // def histogram [1, 0, 4, 1, 1, 4, 9] + // rep histogram [4, 6, 2, 8] + lcw col8{{ + {{{{1, 2, 3, 4}, nulls_at({0, 2})}}, {{{5, 6, 7}, nulls_at({0, 2})}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, nulls_at({0, 2})}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, nulls_at({0, 2})}, + lcw{lcw{lcw{}}}, + }, + null_at(3)}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); + + int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 5, 11}; + std::vector const expected_def_hists[] = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, 
+ {1, 0, 4, 1, 1, 4, 9}}; + std::vector const expected_rep_hists[] = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; + + auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); + auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .write_v2_headers(is_v2) + .compression(cudf::io::compression_type::NONE); + + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + ASSERT_TRUE(chunk.meta_data.size_statistics.has_value()); + ASSERT_TRUE(chunk.meta_data.size_statistics->definition_level_histogram.has_value()); + ASSERT_TRUE(chunk.meta_data.size_statistics->repetition_level_histogram.has_value()); + // there is only one page, so chunk stats should match the page stats + EXPECT_EQ(chunk.meta_data.size_statistics->definition_level_histogram.value(), + expected_def_hists[c]); + EXPECT_EQ(chunk.meta_data.size_statistics->repetition_level_histogram.value(), + expected_rep_hists[c]); + // only column 6 has string data + if (c == 6) { + ASSERT_TRUE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); + EXPECT_EQ(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.value(), 50L); + } else if (c == 7) { + ASSERT_TRUE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); + EXPECT_EQ(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.value(), 44L); + } else { + EXPECT_FALSE(chunk.meta_data.size_statistics->unencoded_byte_array_data_bytes.has_value()); + } + + // loop over offsets, read each page 
header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, expected_hdr_type); + // check null counts in V2 header + if (is_v2) { EXPECT_EQ(ph.data_page_header_v2.num_nulls, expected_null_counts[c]); } + } + + // check null counts in column chunk stats and page indexes + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + EXPECT_EQ(stats.null_count, expected_null_counts[c]); + + // should only be one page + EXPECT_FALSE(ci.null_pages[0]); + ASSERT_TRUE(ci.null_counts.has_value()); + EXPECT_EQ(ci.null_counts.value()[0], expected_null_counts[c]); + + ASSERT_TRUE(ci.definition_level_histogram.has_value()); + EXPECT_EQ(ci.definition_level_histogram.value(), expected_def_hists[c]); + + ASSERT_TRUE(ci.repetition_level_histogram.has_value()); + EXPECT_EQ(ci.repetition_level_histogram.value(), expected_rep_hists[c]); + + if (c == 6) { + ASSERT_TRUE(oi.unencoded_byte_array_data_bytes.has_value()); + EXPECT_EQ(oi.unencoded_byte_array_data_bytes.value()[0], 50L); + } else if (c == 7) { + ASSERT_TRUE(oi.unencoded_byte_array_data_bytes.has_value()); + EXPECT_EQ(oi.unencoded_byte_array_data_bytes.value()[0], 44L); + } else { + EXPECT_FALSE(oi.unencoded_byte_array_data_bytes.has_value()); + } + } + } +} + +TEST_P(ParquetV2Test, CheckEncodings) +{ + using cudf::io::parquet::detail::Encoding; + constexpr auto num_rows = 100'000; + auto const is_v2 = GetParam(); + + auto const validity = cudf::test::iterators::no_nulls(); + // data should be PLAIN for v1, RLE for V2 + auto col0_data = + cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 + auto col1_data = random_values(num_rows); + // 
data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 + auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); + + cudf::test::fixed_width_column_wrapper col0{col0_data, col0_data + num_rows, validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; + column_wrapper col2{col2_data, col2_data + num_rows, validity}; + + auto expected = table_view{{col0, col1, col2}}; + + auto const filename = is_v2 ? "CheckEncodingsV2.parquet" : "CheckEncodingsV1.parquet"; + auto filepath = temp_env->get_temp_filepath(filename); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(num_rows) + .write_v2_headers(is_v2); + cudf::io::write_parquet(out_opts); + + // make sure the expected encodings are present + auto contains = [](auto const& vec, auto const& enc) { + return std::find(vec.begin(), vec.end(), enc) != vec.end(); + }; + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; + auto const& chunk1_enc = fmd.row_groups[0].columns[1].meta_data.encodings; + auto const& chunk2_enc = fmd.row_groups[0].columns[2].meta_data.encodings; + if (is_v2) { + // col0 should have RLE for rep/def and data + EXPECT_TRUE(chunk0_enc.size() == 1); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); + // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data + EXPECT_TRUE(chunk2_enc.size() == 3); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN)); + 
EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE_DICTIONARY)); + } else { + // col0 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk0_enc.size() == 2); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk0_enc, Encoding::PLAIN)); + // col1 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + // col2 should have RLE for rep/def and PLAIN_DICTIONARY for data and dict + EXPECT_TRUE(chunk2_enc.size() == 2); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN_DICTIONARY)); + } +} diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp new file mode 100644 index 00000000000..51190b5de9e --- /dev/null +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -0,0 +1,1720 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "parquet_common.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +template +void test_durations(mask_op_t mask_op) +{ + std::default_random_engine generator; + std::uniform_int_distribution distribution_d(0, 30); + auto sequence_d = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return distribution_d(generator); }); + + std::uniform_int_distribution distribution_s(0, 86400); + auto sequence_s = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return distribution_s(generator); }); + + std::uniform_int_distribution distribution(0, 86400 * 1000); + auto sequence = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return distribution(generator); }); + + auto mask = cudf::detail::make_counting_transform_iterator(0, mask_op); + + constexpr auto num_rows = 100; + // Durations longer than a day are not exactly valid, but cudf should be able to round trip + auto durations_d = cudf::test::fixed_width_column_wrapper( + sequence_d, sequence_d + num_rows, mask); + auto durations_s = cudf::test::fixed_width_column_wrapper( + sequence_s, sequence_s + num_rows, mask); + auto durations_ms = cudf::test::fixed_width_column_wrapper( + sequence, sequence + num_rows, mask); + auto durations_us = cudf::test::fixed_width_column_wrapper( + sequence, sequence + num_rows, mask); + auto durations_ns = cudf::test::fixed_width_column_wrapper( + sequence, sequence + num_rows, mask); + + auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}}; + + auto filepath = temp_env->get_temp_filepath("Durations.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = 
cudf::io::read_parquet(in_opts); + + auto durations_d_got = + cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); + + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ns, result.tbl->view().column(4)); +} + +TEST_F(ParquetWriterTest, Durations) +{ + test_durations([](auto i) { return true; }); + test_durations([](auto i) { return (i % 2) != 0; }); + test_durations([](auto i) { return (i % 3) != 0; }); + test_durations([](auto i) { return false; }); +} + +TEST_F(ParquetWriterTest, MultiIndex) +{ + constexpr auto num_rows = 100; + + auto col0_data = random_values(num_rows); + auto col1_data = random_values(num_rows); + auto col2_data = random_values(num_rows); + auto col3_data = random_values(num_rows); + auto col4_data = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + column_wrapper col0{col0_data.begin(), col0_data.end(), validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; + column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; + column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; + column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; + + auto expected = table_view{{col0, col1, col2, col3, col4}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("int8s"); + expected_metadata.column_metadata[1].set_name("int16s"); + expected_metadata.column_metadata[2].set_name("int32s"); + 
expected_metadata.column_metadata[3].set_name("floats"); + expected_metadata.column_metadata[4].set_name("doubles"); + + auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata) + .key_value_metadata( + {{{"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}}}); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_pandas_metadata(true) + .columns({"int32s", "floats", "doubles"}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(ParquetWriterTest, BufferSource) +{ + constexpr auto num_rows = 100 << 10; + auto const seq_col = random_values(num_rows); + auto const validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + column_wrapper col{seq_col.begin(), seq_col.end(), validity}; + + auto const expected = table_view{{col}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), expected) + .metadata(expected_metadata); + cudf::io::write_parquet(out_opts); + + // host buffer + { + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(out_buffer.data(), out_buffer.size())); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + } + + // device buffer + { + auto 
const d_input = cudf::detail::make_device_uvector_sync( + cudf::host_span{reinterpret_cast(out_buffer.data()), + out_buffer.size()}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + auto const d_buffer = cudf::device_span( + reinterpret_cast(d_input.data()), d_input.size()); + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(d_buffer)); + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); + } +} + +TEST_F(ParquetWriterTest, ManyFragments) +{ + srand(31337); + auto const expected = create_random_fixed_table(10, 6'000'000, false); + + auto const filepath = temp_env->get_temp_filepath("ManyFragments.parquet"); + cudf::io::parquet_writer_options const args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) + .max_page_size_bytes(8 * 1024); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetWriterTest, NonNullable) +{ + srand(31337); + auto expected = create_random_fixed_table(9, 9, false); + + auto filepath = temp_env->get_temp_filepath("NonNullable.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); +} + +TEST_F(ParquetWriterTest, Struct) +{ + // Struct> + + auto names = {"Samuel Vimes", + "Carrot 
Ironfoundersson", + "Angua von Uberwald", + "Cheery Littlebottom", + "Detritus", + "Mr Slant"}; + + // `Name` column has all valid values. + auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto expected = table_view({*struct_2}); + + auto filepath = temp_env->get_temp_filepath("Struct.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath)); + cudf::io::read_parquet(read_args); +} + +// custom data sink that supports device writes. uses plain file io. 
+class custom_test_data_sink : public cudf::io::data_sink { + public: + explicit custom_test_data_sink(std::string const& filepath) + { + outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + } + + virtual ~custom_test_data_sink() { flush(); } + + void host_write(void const* data, size_t size) override + { + outfile_.write(static_cast(data), size); + } + + [[nodiscard]] bool supports_device_write() const override { return true; } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + this->device_write_async(gpu_data, size, stream).get(); + } + + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + return std::async(std::launch::deferred, [=] { + char* ptr = nullptr; + CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); + CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDefault, stream.value())); + stream.synchronize(); + outfile_.write(ptr, size); + CUDF_CUDA_TRY(cudaFreeHost(ptr)); + }); + } + + void flush() override { outfile_.flush(); } + + size_t bytes_written() override { return outfile_.tellp(); } + + private: + std::ofstream outfile_; +}; + +TEST_F(ParquetWriterTest, CustomDataSink) +{ + auto filepath = temp_env->get_temp_filepath("CustomDataSink.parquet"); + custom_test_data_sink custom_sink(filepath); + + srand(31337); + auto expected = create_random_fixed_table(5, 10, false); + + // write out using the custom sink + { + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + } + + // write out using a memmapped sink + std::vector buf_sink; + { + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buf_sink}, *expected); + cudf::io::write_parquet(args); + } + + // read them back in and make sure 
everything matches + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); + + cudf::io::parquet_reader_options buf_args = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{buf_sink.data(), buf_sink.size()}); + auto buf_tbl = cudf::io::read_parquet(buf_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(buf_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) +{ + auto filepath = temp_env->get_temp_filepath("DeviceWriteLargeishFile.parquet"); + custom_test_data_sink custom_sink(filepath); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_random_fixed_table(4, 4 * 1024 * 1024, false); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterTest, PartitionedWrite) +{ + auto source = create_compressible_fixed_table(16, 4 * 1024 * 1024, 1000, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{10, 1024 * 1024}; + auto partition2 = cudf::io::partition_info{20 * 1024 + 7, 3 * 1024 * 1024}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + 
partition2.num_rows}); + + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder( + cudf::io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(args); + + auto result1 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf::io::read_parquet( + cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +template +std::string create_parquet_file(int num_cols) +{ + srand(31337); + auto const table = create_random_fixed_table(num_cols, 10, true); + auto const filepath = + temp_env->get_temp_filepath(typeid(T).name() + std::to_string(num_cols) + ".parquet"); + cudf::io::parquet_writer_options const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table->view()); + cudf::io::write_parquet(out_opts); + return filepath; +} + +TEST_F(ParquetWriterTest, MultipleMismatchedSources) +{ + auto const int5file = create_parquet_file(5); + { + auto const float5file = create_parquet_file(5); + std::vector files{int5file, float5file}; + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{files}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); + } + { + auto const int10file = create_parquet_file(10); + std::vector files{int5file, int10file}; + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{files}); + EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error); + } +} + +TEST_F(ParquetWriterTest, Slice) +{ + auto col = + cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}, {true, true, true, false, true}}; + 
std::vector indices{2, 5}; + std::vector result = cudf::slice(col, indices); + cudf::table_view tbl{result}; + + auto filepath = temp_env->get_temp_filepath("Slice.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto read_table = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(read_table.tbl->view(), tbl); +} + +TEST_F(ParquetWriterTest, DecimalWrite) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table); + + cudf::io::table_input_metadata expected_metadata(table); + + // verify failure if too small a precision is given + expected_metadata.column_metadata[0].set_decimal_precision(7); + expected_metadata.column_metadata[1].set_decimal_precision(1); + args.set_metadata(expected_metadata); + EXPECT_THROW(cudf::io::write_parquet(args), cudf::logic_error); + + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision(7); + expected_metadata.column_metadata[1].set_decimal_precision(9); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + 
cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); +} + +TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +{ + auto const unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .row_group_size_rows(0), + cudf::logic_error); + EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .max_page_size_rows(0), + cudf::logic_error); + EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .row_group_size_bytes(3 << 8), + cudf::logic_error); + EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .max_page_size_bytes(3 << 8), + cudf::logic_error); + EXPECT_THROW(cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .max_page_size_bytes(0xFFFF'FFFFUL), + cudf::logic_error); + + EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .row_group_size_rows(0), + cudf::logic_error); + EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .max_page_size_rows(0), + cudf::logic_error); + EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .row_group_size_bytes(3 << 8), + cudf::logic_error); + EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .max_page_size_bytes(3 << 8), + cudf::logic_error); + EXPECT_THROW(cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info(&out_buffer)) + .max_page_size_bytes(0xFFFF'FFFFUL), + cudf::logic_error); +} + +TEST_F(ParquetWriterTest, RowGroupPageSizeMatch) +{ + auto const unused_table = std::make_unique
(); + std::vector out_buffer; + + auto options = cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), + unused_table->view()) + .row_group_size_bytes(128 * 1024) + .max_page_size_bytes(512 * 1024) + .row_group_size_rows(10000) + .max_page_size_rows(20000) + .build(); + EXPECT_EQ(options.get_row_group_size_bytes(), options.get_max_page_size_bytes()); + EXPECT_EQ(options.get_row_group_size_rows(), options.get_max_page_size_rows()); +} + +TEST_F(ParquetWriterTest, EmptyList) +{ + auto L1 = cudf::make_lists_column(0, + cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), + cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), + 0, + {}); + auto L0 = cudf::make_lists_column( + 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); + + auto filepath = temp_env->get_temp_filepath("EmptyList.parquet"); + cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), + cudf::table_view({*L0}))); + + auto result = cudf::io::read_parquet( + cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); + + using lcw = cudf::test::lists_column_wrapper; + auto expected = lcw{lcw{}, lcw{}, lcw{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), expected); +} + +TEST_F(ParquetWriterTest, DeepEmptyList) +{ + // Make a list column LLLi st only L is valid and LLi are all null. 
This tests whether we can + // handle multiple nullptr offsets + + auto L2 = cudf::make_lists_column(0, + cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), + cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), + 0, + {}); + auto L1 = cudf::make_lists_column( + 0, cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), std::move(L2), 0, {}); + auto L0 = cudf::make_lists_column( + 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); + + auto filepath = temp_env->get_temp_filepath("DeepEmptyList.parquet"); + cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), + cudf::table_view({*L0}))); + + auto result = cudf::io::read_parquet( + cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), *L0); +} + +TEST_F(ParquetWriterTest, EmptyListWithStruct) +{ + auto L2 = cudf::make_lists_column(0, + cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), + cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}), + 0, + {}); + + auto children = std::vector>{}; + children.push_back(std::move(L2)); + auto S2 = cudf::make_structs_column(0, std::move(children), 0, {}); + auto L1 = cudf::make_lists_column( + 0, cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)), std::move(S2), 0, {}); + auto L0 = cudf::make_lists_column( + 3, cudf::test::fixed_width_column_wrapper{0, 0, 0, 0}.release(), std::move(L1), 0, {}); + + auto filepath = temp_env->get_temp_filepath("EmptyListWithStruct.parquet"); + cudf::io::write_parquet(cudf::io::parquet_writer_options_builder(cudf::io::sink_info(filepath), + cudf::table_view({*L0}))); + auto result = cudf::io::read_parquet( + cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath))); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), *L0); +} + +TEST_F(ParquetWriterTest, CheckPageRows) +{ + auto sequence = 
thrust::make_counting_iterator(0); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto page_rows = 5000; + constexpr auto num_rows = 2 * page_rows; + column_wrapper col(sequence, sequence + num_rows, validity); + + auto expected = table_view{{col}}; + + auto const filepath = temp_env->get_temp_filepath("CheckPageRows.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(page_rows); + cudf::io::write_parquet(out_opts); + + // check first page header and make sure it has only page_rows values + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_GT(fmd.row_groups.size(), 0); + ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); + auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; + ASSERT_GT(first_chunk.data_page_offset, 0); + + // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded + // version should be smaller than size of the struct. 
+ auto const ph = read_page_header( + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); + + EXPECT_EQ(ph.data_page_header.num_values, page_rows); +} + +TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) +{ + // enough for a few pages with the default 20'000 rows/page + constexpr auto rows_per_page = 20'000; + constexpr auto num_rows = 3 * rows_per_page; + const std::string s1(32, 'a'); + auto col0_elements = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); + auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); + + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("CheckPageRowsAdjusted.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(rows_per_page); + cudf::io::write_parquet(out_opts); + + // check first page header and make sure it has only page_rows values + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_GT(fmd.row_groups.size(), 0); + ASSERT_EQ(fmd.row_groups[0].columns.size(), 1); + auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; + ASSERT_GT(first_chunk.data_page_offset, 0); + + // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded + // version should be smaller than size of the struct. 
+ auto const ph = read_page_header( + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); + + EXPECT_LE(ph.data_page_header.num_values, rows_per_page); +} + +TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) +{ + constexpr auto rows_per_page = 1'000; + constexpr auto fragment_size = 5'000; + constexpr auto num_rows = 3 * rows_per_page; + const std::string s1(32, 'a'); + auto col0_elements = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return s1; }); + auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + num_rows); + + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("CheckPageRowsTooSmall.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_fragment_size(fragment_size) + .max_page_size_rows(rows_per_page); + cudf::io::write_parquet(out_opts); + + // check that file is written correctly when rows/page < fragment size + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() == 1); + auto const& first_chunk = fmd.row_groups[0].columns[0].meta_data; + ASSERT_TRUE(first_chunk.data_page_offset > 0); + + // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded + // version should be smaller than size of the struct. 
+ auto const ph = read_page_header( + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); + + // there should be only one page since the fragment size is larger than rows_per_page + EXPECT_EQ(ph.data_page_header.num_values, num_rows); +} + +TEST_F(ParquetWriterTest, Decimal128Stats) +{ + // check that decimal128 min and max statistics are written in network byte order + // this is negative, so should be the min + std::vector expected_min{ + 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_max{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6}; + + __int128_t val0 = 0xa1b2'c3d4'e5f6ULL; + __int128_t val1 = val0 << 80; + column_wrapper col0{{numeric::decimal128(val0, numeric::scale_type{0}), + numeric::decimal128(val1, numeric::scale_type{0})}}; + + auto expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("Decimal128Stats.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + auto const stats = get_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + +TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) +{ + char const* coldata[] = { + // in-range 7 bit. should truncate to "yyyyyyyz" + "yyyyyyyyy", + // max 7 bit. should truncate to "x7fx7fx7fx7fx7fx7fx7fx80", since it's + // considered binary, not UTF-8. If UTF-8 it should not truncate. + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + // max binary. this should not truncate + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + // in-range 2-byte UTF8 (U+00E9). should truncate to "éééê" + "ééééé", + // max 2-byte UTF8 (U+07FF). 
should not truncate + "߿߿߿߿߿", + // in-range 3-byte UTF8 (U+0800). should truncate to "ࠀࠁ" + "ࠀࠀࠀ", + // max 3-byte UTF8 (U+FFFF). should not truncate + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + // in-range 4-byte UTF8 (U+10000). should truncate to "𐀀𐀁" + "𐀀𐀀𐀀", + // max unicode (U+10FFFF). should truncate to \xf4\x8f\xbf\xbf\xf4\x90\x80\x80, + // which is no longer valid unicode, but is still ok UTF-8??? + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + // max 4-byte UTF8 (U+1FFFFF). should not truncate + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + + // NOTE: UTF8 min is initialized with 0xf7bfbfbf. Binary values larger + // than that will not become minimum value (when written as UTF-8). + char const* truncated_min[] = {"yyyyyyyy", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f", + "\xf7\xbf\xbf\xbf", + "éééé", + "߿߿߿߿", + "ࠀࠀ", + "\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀀", + "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf", + "\xf7\xbf\xbf\xbf"}; + + char const* truncated_max[] = {"yyyyyyyz", + "\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x80", + "\xff\xff\xff\xff\xff\xff\xff\xff\xff", + "éééê", + "߿߿߿߿߿", + "ࠀࠁ", + "\xef\xbf\xbf\xef\xbf\xbf\xef\xbf\xbf", + "𐀀𐀁", + "\xf4\x8f\xbf\xbf\xf4\x90\x80\x80", + "\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf\xf7\xbf\xbf\xbf"}; + + auto cols = [&]() { + using string_wrapper = column_wrapper; + std::vector> cols; + for (auto const str : coldata) { + cols.push_back(string_wrapper{str}.release()); + } + return cols; + }(); + auto expected = std::make_unique
(std::move(cols)); + + auto const filepath = temp_env->get_temp_filepath("CheckColumnIndexTruncation.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected->view()) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .column_index_truncate_length(8); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); + + // check that truncated values == expected + EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); + EXPECT_EQ(memcmp(ci.max_values[0].data(), truncated_max[c], ci.max_values[0].size()), 0); + } + } +} + +TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) +{ + std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + + std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + + cudf::test::lists_column_wrapper col0{ + {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 
0xfe}}; + cudf::test::lists_column_wrapper col1{ + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + cudf::test::lists_column_wrapper col2{ + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + + auto expected = table_view{{col0, col1, col2}}; + + cudf::io::table_input_metadata output_metadata(expected); + output_metadata.column_metadata[0].set_name("col_binary0").set_output_as_binary(true); + output_metadata.column_metadata[1].set_name("col_binary1").set_output_as_binary(true); + output_metadata.column_metadata[2].set_name("col_binary2").set_output_as_binary(true); + + auto const filepath = temp_env->get_temp_filepath("BinaryColumnIndexTruncation.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(output_metadata)) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .column_index_truncate_length(8); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + auto const ci = read_column_index(source, chunk); + auto const stats = get_statistics(chunk); + + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max + auto const ptype = fmd.schema[c + 1].type; + auto const ctype = fmd.schema[c + 1].converted_type; + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); + + // check that truncated values == expected + EXPECT_EQ(ci.min_values[0], truncated_min[c]); + EXPECT_EQ(ci.max_values[0], 
truncated_max[c]); + } + } +} + +TEST_F(ParquetWriterTest, ByteArrayStats) +{ + // check that byte array min and max statistics are written as expected. If a byte array is + // written as a string, max utf8 is 0xf7bfbfbf and so the minimum value will be set to that value + // instead of a potential minimum higher than that. + std::vector expected_col0_min{0xf0}; + std::vector expected_col0_max{0xf0, 0xf5, 0xf5}; + std::vector expected_col1_min{0xfe, 0xfe, 0xfe}; + std::vector expected_col1_max{0xfe, 0xfe, 0xfe}; + + cudf::test::lists_column_wrapper list_int_col0{ + {0xf0}, {0xf0, 0xf5, 0xf3}, {0xf0, 0xf5, 0xf5}}; + cudf::test::lists_column_wrapper list_int_col1{ + {0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}, {0xfe, 0xfe, 0xfe}}; + + auto expected = table_view{{list_int_col0, list_int_col1}}; + cudf::io::table_input_metadata output_metadata(expected); + output_metadata.column_metadata[0].set_name("col_binary0").set_output_as_binary(true); + output_metadata.column_metadata[1].set_name("col_binary1").set_output_as_binary(true); + + auto filepath = temp_env->get_temp_filepath("ByteArrayStats.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(output_metadata)); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .set_column_schema({{}, {}}); + auto result = cudf::io::read_parquet(in_opts); + + auto source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + + auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); + auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); + + EXPECT_EQ(expected_col0_min, 
stats0.min_value); + EXPECT_EQ(expected_col0_max, stats0.max_value); + EXPECT_EQ(expected_col1_min, stats1.min_value); + EXPECT_EQ(expected_col1_max, stats1.max_value); +} + +TEST_F(ParquetWriterTest, SingleValueDictionaryTest) +{ + constexpr unsigned int expected_bits = 1; + constexpr unsigned int nrows = 1'000'000U; + + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "a unique string value suffixed with 1"; }); + auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("SingleValueDictionaryTest.parquet"); + // set row group size so that there will be only one row group + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::NONE) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .row_group_size_rows(nrows); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + // make sure dictionary was used + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd]() { + for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_TRUE(used_dict()); + + // and check that the correct number of bits was used + auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); + auto const nbits 
= read_dict_bits(source, oi.page_locations[0]); + EXPECT_EQ(nbits, expected_bits); +} + +TEST_F(ParquetWriterTest, DictionaryNeverTest) +{ + constexpr unsigned int nrows = 1'000U; + + // only one value, so would normally use dictionary + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "a unique string value suffixed with 1"; }); + auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("DictionaryNeverTest.parquet"); + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + // make sure dictionary was not used + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd]() { + for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_FALSE(used_dict()); +} + +TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) +{ + constexpr unsigned int nrows = 65'536U; + // cardinality is chosen to result in a dictionary > 1MB in size + constexpr unsigned int cardinality = 32'768U; + + // single value will have a small dictionary + auto elements0 = cudf::detail::make_counting_transform_iterator( + 0, 
[](auto i) { return "a unique string value suffixed with 1"; }); + auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); + + // high cardinality will have a large dictionary + auto elements1 = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { + return "a unique string value suffixed with " + std::to_string(i % cardinality); + }); + auto const col1 = cudf::test::strings_column_wrapper(elements1, elements1 + nrows); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DictionaryAdaptiveTest.parquet"); + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::ZSTD) + .dictionary_policy(cudf::io::dictionary_policy::ADAPTIVE); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + // make sure dictionary was used as expected. col0 should use one, + // col1 should not. 
+ auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd](int col) { + for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_TRUE(used_dict(0)); + EXPECT_FALSE(used_dict(1)); +} + +TEST_F(ParquetWriterTest, DictionaryAlwaysTest) +{ + constexpr unsigned int nrows = 65'536U; + // cardinality is chosen to result in a dictionary > 1MB in size + constexpr unsigned int cardinality = 32'768U; + + // single value will have a small dictionary + auto elements0 = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "a unique string value suffixed with 1"; }); + auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); + + // high cardinality will have a large dictionary + auto elements1 = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { + return "a unique string value suffixed with " + std::to_string(i % cardinality); + }); + auto const col1 = cudf::test::strings_column_wrapper(elements1, elements1 + nrows); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DictionaryAlwaysTest.parquet"); + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::ZSTD) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, 
result.tbl->view()); + + // make sure dictionary was used for both columns + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd](int col) { + for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_TRUE(used_dict(0)); + EXPECT_TRUE(used_dict(1)); +} + +TEST_F(ParquetWriterTest, DictionaryPageSizeEst) +{ + // one page + constexpr unsigned int nrows = 20'000U; + + // this test is creating a pattern of repeating then non-repeating values to trigger + // a "worst-case" for page size estimation in the presence of a dictionary. have confirmed + // that this fails for values over 16 in the final term of `max_RLE_page_size()`. + // The output of the iterator will be 'CCCCCRRRRRCCCCCRRRRR...` where 'C' is a changing + // value, and 'R' repeats. The encoder will turn this into a literal run of 8 values + // (`CCCCCRRR`) followed by a repeated run of 2 (`RR`). This pattern then repeats, getting + // as close as possible to a condition of repeated 8 value literal runs. 
+ auto elements0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + if ((i / 5) % 2 == 1) { + return std::string("non-unique string"); + } else { + return "a unique string value suffixed with " + std::to_string(i); + } + }); + auto const col0 = cudf::test::strings_column_wrapper(elements0, elements0 + nrows); + + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("DictionaryPageSizeEst.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::ZSTD) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TEST_F(ParquetWriterTest, UserNullability) +{ + auto weight_col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_col = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_col = cudf::test::structs_column_wrapper{weight_col, ages_col}; + + auto expected = table_view({struct_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + expected_metadata.column_metadata[0].child(0).set_nullability(true); + + auto filepath = temp_env->get_temp_filepath("SingleWriteNullable.parquet"); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + cudf::io::write_parquet(write_opts); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + 
EXPECT_FALSE(result.tbl->view().column(0).nullable()); + EXPECT_TRUE(result.tbl->view().column(0).child(0).nullable()); + EXPECT_FALSE(result.tbl->view().column(0).child(1).nullable()); +} + +TEST_F(ParquetWriterTest, UserNullabilityInvalid) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); + auto col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}, valids}; + auto expected = table_view({col}); + + auto filepath = temp_env->get_temp_filepath("SingleWriteNullableInvalid.parquet"); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + // Should work without the nullability option + EXPECT_NO_THROW(cudf::io::write_parquet(write_opts)); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + write_opts.set_metadata(std::move(expected_metadata)); + // Can't write a column with nulls as not nullable + EXPECT_THROW(cudf::io::write_parquet(write_opts), cudf::logic_error); +} + +TEST_F(ParquetWriterTest, CompStats) +{ + auto table = create_random_fixed_table(1, 100000, true); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}, table->view()) + .compression_statistics(stats); + cudf::io::write_parquet(opts); + + EXPECT_NE(stats->num_compressed_bytes(), 0); + EXPECT_EQ(stats->num_failed_bytes(), 0); + EXPECT_EQ(stats->num_skipped_bytes(), 0); + EXPECT_FALSE(std::isnan(stats->compression_ratio())); +} + +TEST_F(ParquetWriterTest, CompStatsEmptyTable) +{ + auto table_no_rows = create_random_fixed_table(20, 0, false); + + auto const stats = std::make_shared(); + + std::vector unused_buffer; + cudf::io::parquet_writer_options opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&unused_buffer}, + 
table_no_rows->view()) + .compression_statistics(stats); + cudf::io::write_parquet(opts); + + expect_compression_stats_empty(stats); +} + +TEST_F(ParquetWriterTest, NoNullsAsNonNullable) +{ + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + column_wrapper col{{1, 2, 3}, valids}; + table_view expected({col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + + auto filepath = temp_env->get_temp_filepath("NonNullable.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(std::move(expected_metadata)); + // Writer should be able to write a column without nulls as non-nullable + EXPECT_NO_THROW(cudf::io::write_parquet(out_opts)); +} + +TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow) +{ + using namespace cuda::std::chrono; + using namespace cudf::io; + + column_wrapper big_ts_col{ + sys_days{year{3023} / month{7} / day{14}} + 7h + 38min + 45s + 418688us, + sys_days{year{723} / month{3} / day{21}} + 14h + 20min + 13s + microseconds{781ms}}; + + table_view expected({big_ts_col}); + auto filepath = temp_env->get_temp_filepath("BigINT96Timestamp.parquet"); + + auto const out_opts = + parquet_writer_options::builder(sink_info{filepath}, expected).int96_timestamps(true).build(); + write_parquet(out_opts); + + auto const in_opts = parquet_reader_options::builder(source_info(filepath)) + .timestamp_type(cudf::data_type(cudf::type_id::TIMESTAMP_MICROSECONDS)) + .build(); + auto const result = read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TEST_F(ParquetWriterTest, PreserveNullability) +{ + constexpr auto num_rows = 100; + + auto const col0_data = random_values(num_rows); + auto const col1_data = random_values(num_rows); + + auto const col0_validity = cudf::test::iterators::no_nulls(); + auto const 
col1_validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + column_wrapper col0{col0_data.begin(), col0_data.end(), col0_validity}; + column_wrapper col1{col1_data.begin(), col1_data.end(), col1_validity}; + auto const col2 = make_parquet_list_list_col(0, num_rows, 5, 8, true); + + auto const expected = table_view{{col0, col1, *col2}}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("mandatory"); + expected_metadata.column_metadata[0].set_nullability(false); + expected_metadata.column_metadata[1].set_name("optional"); + expected_metadata.column_metadata[1].set_nullability(true); + expected_metadata.column_metadata[2].set_name("lists"); + expected_metadata.column_metadata[2].set_nullability(true); + // offsets is a cudf thing that's not part of the parquet schema so it won't have nullability set + expected_metadata.column_metadata[2].child(0).set_name("offsets"); + expected_metadata.column_metadata[2].child(1).set_name("element"); + expected_metadata.column_metadata[2].child(1).set_nullability(false); + expected_metadata.column_metadata[2].child(1).child(0).set_name("offsets"); + expected_metadata.column_metadata[2].child(1).child(1).set_name("element"); + expected_metadata.column_metadata[2].child(1).child(1).set_nullability(true); + + auto const filepath = temp_env->get_temp_filepath("PreserveNullability.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(expected_metadata); + + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(in_opts); + auto const read_metadata = cudf::io::table_input_metadata{result.metadata}; + + // test that expected_metadata matches read_metadata + std::function + 
compare_names_and_nullability = [&](auto lhs, auto rhs) { + EXPECT_EQ(lhs.get_name(), rhs.get_name()); + ASSERT_EQ(lhs.is_nullability_defined(), rhs.is_nullability_defined()); + if (lhs.is_nullability_defined()) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); } + ASSERT_EQ(lhs.num_children(), rhs.num_children()); + for (int i = 0; i < lhs.num_children(); ++i) { + compare_names_and_nullability(lhs.child(i), rhs.child(i)); + } + }; + + ASSERT_EQ(expected_metadata.column_metadata.size(), read_metadata.column_metadata.size()); + + for (size_t i = 0; i < expected_metadata.column_metadata.size(); ++i) { + compare_names_and_nullability(expected_metadata.column_metadata[i], + read_metadata.column_metadata[i]); + } +} + +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); 
+ EXPECT_EQ(max_value, std::string(max_val)); +} + +// custom mem mapped data sink that supports device writes +template +class custom_test_memmap_sink : public cudf::io::data_sink { + public: + explicit custom_test_memmap_sink(std::vector* mm_writer_buf) + { + mm_writer = cudf::io::data_sink::create(mm_writer_buf); + } + + virtual ~custom_test_memmap_sink() { mm_writer->flush(); } + + void host_write(void const* data, size_t size) override { mm_writer->host_write(data, size); } + + [[nodiscard]] bool supports_device_write() const override { return supports_device_writes; } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + this->device_write_async(gpu_data, size, stream).get(); + } + + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + return std::async(std::launch::deferred, [=] { + char* ptr = nullptr; + CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); + CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDefault, stream.value())); + stream.synchronize(); + mm_writer->host_write(ptr, size); + CUDF_CUDA_TRY(cudaFreeHost(ptr)); + }); + } + + void flush() override { mm_writer->flush(); } + + size_t bytes_written() override { return mm_writer->bytes_written(); } + + private: + std::unique_ptr mm_writer; +}; + +//////////////////////// +// Numeric type tests + +// Typed test fixture for numeric type tests +template +struct ParquetWriterNumericTypeTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes); + +TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) +{ + auto sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto num_rows = 800; + column_wrapper col(sequence, sequence 
+ num_rows, validity); + + auto expected = table_view{{col}}; + + auto filepath = temp_env->get_temp_filepath("SingleColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumnWithNulls) +{ + auto sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i); }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + + constexpr auto num_rows = 100; + column_wrapper col(sequence, sequence + num_rows, validity); + + auto expected = table_view{{col}}; + + auto filepath = temp_env->get_temp_filepath("SingleColumnWithNulls.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +///////////////////////// +// timestamp type tests + +// Typed test fixture for timestamp type tests +template +struct ParquetWriterTimestampTypeTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + +TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); + +TYPED_TEST(ParquetWriterTimestampTypeTest, Timestamps) +{ + auto sequence = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); + auto validity = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + + auto expected = table_view{{col}}; + + auto filepath = temp_env->get_temp_filepath("Timestamps.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .timestamp_type(this->type()); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampsWithNulls) +{ + auto sequence = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); + auto validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 30) && (i < 60); }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + + auto expected = table_view{{col}}; + + auto filepath = temp_env->get_temp_filepath("TimestampsWithNulls.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .timestamp_type(this->type()); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) +{ + constexpr int64_t max = std::numeric_limits::max(); + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); + auto validity = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("ParquetTimestampOverflow.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .timestamp_type(this->type()); + auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +////////////////////////////// +// writer stress tests + +// Base test fixture for "stress" tests +struct ParquetWriterStressTest : public cudf::test::BaseFixture {}; + +TEST_F(ParquetWriterStressTest, LargeTableWeakCompression) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_random_fixed_table(16, 4 * 1024 * 1024, false); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterStressTest, LargeTableGoodCompression) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = 
create_compressible_fixed_table(16, 4 * 1024 * 1024, 128 * 1024, false); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterStressTest, LargeTableWithValids) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 6, true); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableWeakCompression) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_random_fixed_table(16, 4 * 1024 * 1024, false); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args 
= + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableGoodCompression) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 128 * 1024, false); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} + +TEST_F(ParquetWriterStressTest, DeviceWriteLargeTableWithValids) +{ + std::vector mm_buf; + mm_buf.reserve(4 * 1024 * 1024 * 16); + custom_test_memmap_sink custom_sink(&mm_buf); + + // exercises multiple rowgroups + srand(31337); + auto expected = create_compressible_fixed_table(16, 4 * 1024 * 1024, 6, true); + + // write out using the custom sink (which uses device writes) + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::write_parquet(args); + + cudf::io::parquet_reader_options custom_args = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{mm_buf.data(), mm_buf.size()}); + auto custom_tbl = cudf::io::read_parquet(custom_args); + CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); +} diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp 
index b4583ac4f17..0c259c81a23 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index e186dcda9cb..6f46df20633 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index afa02741be1..b207c3f15a6 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index a14e7ecf5b3..3bb15a59aa3 100644 --- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index d70a074692e..98befb0a3ee 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ */ #include +#include + #include #include diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 420b3d4aff5..3447aa0dde6 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ */ #include +#include + #include #include diff --git a/cpp/tests/iterator/scalar_iterator_test.cu b/cpp/tests/iterator/scalar_iterator_test.cu index b867703535e..8d24dc41e56 100644 --- a/cpp/tests/iterator/scalar_iterator_test.cu +++ b/cpp/tests/iterator/scalar_iterator_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -14,6 +14,8 @@ */ #include +#include + #include #include diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp index a25b19e99d8..22bc7475dbe 100644 --- a/cpp/tests/iterator/value_iterator.cpp +++ b/cpp/tests/iterator/value_iterator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,5 +14,6 @@ */ #include +#include CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/iterator/value_iterator_test_transform.cu b/cpp/tests/iterator/value_iterator_test_transform.cu index a309cfd6327..417233e759b 100644 --- a/cpp/tests/iterator/value_iterator_test_transform.cu +++ b/cpp/tests/iterator/value_iterator_test_transform.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ */ #include +#include + #include #include #include diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp index 5f00c5f561a..f6df2abc01a 100644 --- a/cpp/tests/jit/parse_ptx_function.cpp +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include struct JitParseTest : public ::testing::Test {}; diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index a416df0c7c3..651e44511fb 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp index 91aa13a2cd2..2356c7e5ce1 100644 --- a/cpp/tests/labeling/label_bins_tests.cpp +++ b/cpp/tests/labeling/label_bins_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 017cd471e01..92dd5df5ec7 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 3558e5676dd..d73c3192549 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp index f3d131ae50c..f1486a49bf9 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index a607376b0a5..b25a4d6c666 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 7644ac48892..905cd67bc95 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index a13829c5abc..74ee27137ed 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp index 80b9416eaaa..50736940520 100644 --- a/cpp/tests/replace/normalize_replace_tests.cpp +++ b/cpp/tests/replace/normalize_replace_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include // This is the main test fixture struct ReplaceTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index c74f6d6e6a6..35232204db7 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index a7c54145708..6c23dd6bdc8 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018 BlazingDB, Inc. * Copyright 2018 Alexander Ocsa @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 63460b0cb15..8b953079d34 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018 BlazingDB, Inc. * Copyright 2018 Cristhian Alberto Gonzales Castillo @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index e8ea9d619c5..bc7488bbf9e 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index d0181974479..c0307000f5c 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp index f97bb7a5323..64d08fa7338 100644 --- a/cpp/tests/round/round_tests.cpp +++ b/cpp/tests/round/round_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 7da5c408a48..5f132f3ace9 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index d2f2b5d6a2e..cb689abb8d8 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index efff7cd31c6..7550cc27161 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index ba7c9d82533..271b119ff80 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index e0828b7a0db..e84275f41ef 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 7ee090cc13f..cb7d11dab35 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index c7ceb899833..d1e0dfb1ff1 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index b7617fc5724..fb25c67b763 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index f8d94afdb20..0d6b870c33b 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 7b179588385..c5a5a342471 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index b7950052cf9..ce8ed9285fe 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index eff327e75be..5fa02d9978a 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -21,6 +21,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index 6d8f714401c..e54929f1651 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index cf46dd74138..59094db6cc3 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 32d55624fc6..53bf224649e 100644 --- a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index 7b2384d38cb..d7df8f03ec1 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index 76d1f769856..19c18a8b0c1 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 9c2aac0a0b5..f5b7a499243 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 9d2cc257f02..da17e33e11a 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 881a237de72..870528d306c 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index f7d3df18ffd..4086c5a91bb 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include From 433bdc32eeecf8fa2fcd900214a9005eb4a07992 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 9 Jan 2024 14:54:16 -0800 Subject: [PATCH 02/23] Convert `FieldType` to scoped enum (#14642) Switch to scoped enum (`enum class`); they are better because, well, values now have a scope. Another benefit in this case - values are now named consistently with compact protocol. De-duplicated some code, now that more static_casts are required and duplication stands out more. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14642 --- .../io/parquet/compact_protocol_reader.cpp | 110 ++++++++++-------- .../io/parquet/compact_protocol_writer.cpp | 43 +++---- .../io/parquet/compact_protocol_writer.hpp | 11 +- cpp/src/io/parquet/page_enc.cu | 49 ++++---- cpp/src/io/parquet/page_hdr.cu | 34 +++--- cpp/src/io/parquet/parquet_common.hpp | 29 ++--- 6 files changed, 153 insertions(+), 123 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 55848802f12..d39d832c18c 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,28 +45,37 @@ class parquet_field { std::string field_type_string(FieldType type) { switch (type) { - case ST_FLD_TRUE: return "bool(true)"; - case ST_FLD_FALSE: return "bool(false)"; - case ST_FLD_BYTE: return "int8"; - case ST_FLD_I16: return "int16"; - case ST_FLD_I32: return "int32"; - case ST_FLD_I64: return "int64"; - case ST_FLD_DOUBLE: return "double"; - case ST_FLD_BINARY: return "binary"; - case ST_FLD_STRUCT: return "struct"; - case ST_FLD_LIST: return "list"; - case ST_FLD_SET: return "set"; - default: return "unknown(" + std::to_string(type) + ")"; + case FieldType::BOOLEAN_TRUE: return "bool(true)"; + case FieldType::BOOLEAN_FALSE: return "bool(false)"; + case FieldType::I8: return "int8"; + case FieldType::I16: return "int16"; + case FieldType::I32: return "int32"; + case FieldType::I64: return "int64"; + case FieldType::DOUBLE: return "double"; + case FieldType::BINARY: return "binary"; + case FieldType::LIST: return "list"; + case FieldType::SET: return "set"; + case FieldType::MAP: return "map"; + case FieldType::STRUCT: return "struct"; + case FieldType::UUID: return "UUID"; + default: return "unknown(" + std::to_string(static_cast(type)) + ")"; } } void assert_field_type(int type, FieldType expected) { - CUDF_EXPECTS(type == expected, + CUDF_EXPECTS(type == static_cast(expected), "expected " + field_type_string(expected) + " field, got " + field_type_string(static_cast(type)) + " field instead"); } +void assert_bool_field_type(int type) +{ + auto const field_type = static_cast(type); + CUDF_EXPECTS(field_type == FieldType::BOOLEAN_TRUE || field_type == FieldType::BOOLEAN_FALSE, + "expected bool field, got " + field_type_string(field_type) + " field instead"); +} + /** * @brief Abstract base class for list functors. 
*/ @@ -86,7 +95,7 @@ class parquet_field_list : public parquet_field { public: inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_LIST); + assert_field_type(field_type, FieldType::LIST); auto const [t, n] = cpr->get_listh(); assert_field_type(t, EXPECTED_ELEM_TYPE); val.resize(n); @@ -111,8 +120,8 @@ class parquet_field_bool : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - CUDF_EXPECTS(field_type == ST_FLD_TRUE || field_type == ST_FLD_FALSE, "expected bool field"); - val = field_type == ST_FLD_TRUE; + assert_bool_field_type(field_type); + val = field_type == static_cast(FieldType::BOOLEAN_TRUE); } }; @@ -122,14 +131,13 @@ class parquet_field_bool : public parquet_field { * @return True if field types mismatch or if the process of reading a * bool fails */ -struct parquet_field_bool_list : public parquet_field_list { +struct parquet_field_bool_list : public parquet_field_list { parquet_field_bool_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { auto const current_byte = cpr->getb(); - CUDF_EXPECTS(current_byte == ST_FLD_TRUE || current_byte == ST_FLD_FALSE, - "expected bool field"); - this->val[i] = current_byte == ST_FLD_TRUE; + assert_bool_field_type(current_byte); + this->val[i] = current_byte == static_cast(FieldType::BOOLEAN_TRUE); }; bind_read_func(read_value); } @@ -162,9 +170,9 @@ class parquet_field_int : public parquet_field { } }; -using parquet_field_int8 = parquet_field_int; -using parquet_field_int32 = parquet_field_int; -using parquet_field_int64 = parquet_field_int; +using parquet_field_int8 = parquet_field_int; +using parquet_field_int32 = parquet_field_int; +using parquet_field_int64 = parquet_field_int; /** * @brief Functor to read a vector of integers from CompactProtocolReader @@ -183,7 +191,7 @@ struct parquet_field_int_list : public parquet_field_list { } }; 
-using parquet_field_int64_list = parquet_field_int_list; +using parquet_field_int64_list = parquet_field_int_list; /** * @brief Functor to read a string from CompactProtocolReader @@ -199,7 +207,7 @@ class parquet_field_string : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n < static_cast(cpr->m_end - cpr->m_cur), "string length mismatch"); @@ -214,7 +222,7 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list { +struct parquet_field_string_list : public parquet_field_list { parquet_field_string_list(int f, std::vector& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -241,7 +249,7 @@ class parquet_field_enum : public parquet_field { parquet_field_enum(int f, Enum& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_I32); + assert_field_type(field_type, FieldType::I32); val = static_cast(cpr->get_i32()); } }; @@ -253,8 +261,9 @@ class parquet_field_enum : public parquet_field { * enum fails */ template -struct parquet_field_enum_list : public parquet_field_list { - parquet_field_enum_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_enum_list : public parquet_field_list { + parquet_field_enum_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { this->val[i] = static_cast(cpr->get_i32()); @@ -278,7 +287,7 @@ class parquet_field_struct : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + 
assert_field_type(field_type, FieldType::STRUCT); cpr->read(&val); } }; @@ -324,7 +333,7 @@ class parquet_field_union_enumerator : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + assert_field_type(field_type, FieldType::STRUCT); cpr->skip_struct_field(field_type); val = static_cast(field()); } @@ -337,8 +346,9 @@ class parquet_field_union_enumerator : public parquet_field { * struct fails */ template -struct parquet_field_struct_list : public parquet_field_list { - parquet_field_struct_list(int f, std::vector& v) : parquet_field_list(f, v) +struct parquet_field_struct_list : public parquet_field_list { + parquet_field_struct_list(int f, std::vector& v) + : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { cpr->read(&this->val[i]); @@ -361,7 +371,7 @@ class parquet_field_binary : public parquet_field { inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_BINARY); + assert_field_type(field_type, FieldType::BINARY); auto const n = cpr->get_u32(); CUDF_EXPECTS(n <= static_cast(cpr->m_end - cpr->m_cur), "binary length mismatch"); @@ -377,7 +387,8 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list : public parquet_field_list, ST_FLD_BINARY> { +struct parquet_field_binary_list + : public parquet_field_list, FieldType::BINARY> { parquet_field_binary_list(int f, std::vector>& v) : parquet_field_list(f, v) { auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) { @@ -404,7 +415,7 @@ class parquet_field_struct_blob : public parquet_field { parquet_field_struct_blob(int f, std::vector& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { - assert_field_type(field_type, ST_FLD_STRUCT); + 
assert_field_type(field_type, FieldType::STRUCT); uint8_t const* const start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } @@ -439,24 +450,25 @@ class parquet_field_optional : public parquet_field { */ void CompactProtocolReader::skip_struct_field(int t, int depth) { - switch (t) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u64(); break; - case ST_FLD_BYTE: skip_bytes(1); break; - case ST_FLD_DOUBLE: skip_bytes(8); break; - case ST_FLD_BINARY: skip_bytes(get_u32()); break; - case ST_FLD_LIST: [[fallthrough]]; - case ST_FLD_SET: { + auto const t_enum = static_cast(t); + switch (t_enum) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u64(); break; + case FieldType::I8: skip_bytes(1); break; + case FieldType::DOUBLE: skip_bytes(8); break; + case FieldType::BINARY: skip_bytes(get_u32()); break; + case FieldType::LIST: + case FieldType::SET: { auto const [t, n] = get_listh(); CUDF_EXPECTS(depth <= 10, "struct nesting too deep"); for (uint32_t i = 0; i < n; i++) { skip_struct_field(t, depth + 1); } } break; - case ST_FLD_STRUCT: + case FieldType::STRUCT: for (;;) { int const c = getb(); t = c & 0xf; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index f857b75f707..d610ec6c546 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -268,39 +268,40 @@ uint32_t CompactProtocolFieldWriter::put_int(int64_t v) return put_uint(((v ^ -s) << 1) + s); } -void CompactProtocolFieldWriter::put_field_header(int f, int cur, int t) +void CompactProtocolFieldWriter::put_field_header(int f, int cur, FieldType t) { if (f > cur && f <= cur + 15) - put_byte(((f - cur) << 4) | t); + put_packed_type_byte(f - cur, t); else { - put_byte(t); + put_byte(static_cast(t)); put_int(f); } } inline void CompactProtocolFieldWriter::field_bool(int field, bool b) { - put_field_header(field, current_field_value, b ? ST_FLD_TRUE : ST_FLD_FALSE); + put_field_header( + field, current_field_value, b ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int8(int field, int8_t val) { - put_field_header(field, current_field_value, ST_FLD_BYTE); + put_field_header(field, current_field_value, FieldType::I8); put_byte(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int32_t val) { - put_field_header(field, current_field_value, ST_FLD_I32); + put_field_header(field, current_field_value, FieldType::I32); put_int(val); current_field_value = field; } inline void CompactProtocolFieldWriter::field_int(int field, int64_t val) { - put_field_header(field, current_field_value, ST_FLD_I64); + put_field_header(field, current_field_value, FieldType::I64); put_int(val); current_field_value = field; } @@ -309,8 +310,8 @@ template <> inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I64)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I64); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const v : val) { put_int(v); @@ -321,8 +322,8 @@ inline void 
CompactProtocolFieldWriter::field_int_list(int field, template inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte(static_cast((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I32)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::I32); if (val.size() >= 0xfUL) { put_uint(val.size()); } for (auto const& v : val) { put_int(static_cast(v)); @@ -333,7 +334,7 @@ inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); if constexpr (not std::is_empty_v) { writer.write(val); // write the struct if it's not empty } else { @@ -344,7 +345,7 @@ inline void CompactProtocolFieldWriter::field_struct(int field, T const& val) inline void CompactProtocolFieldWriter::field_empty_struct(int field) { - put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(0); // add a stop field current_field_value = field; } @@ -352,8 +353,8 @@ inline void CompactProtocolFieldWriter::field_empty_struct(int field) template inline void CompactProtocolFieldWriter::field_struct_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_STRUCT)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::STRUCT); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { writer.write(v); @@ -370,7 +371,7 @@ inline size_t CompactProtocolFieldWriter::value() inline void CompactProtocolFieldWriter::field_struct_blob(int field, std::vector const& val) { - 
put_field_header(field, current_field_value, ST_FLD_STRUCT); + put_field_header(field, current_field_value, FieldType::STRUCT); put_byte(val.data(), static_cast(val.size())); put_byte(0); current_field_value = field; @@ -378,7 +379,7 @@ inline void CompactProtocolFieldWriter::field_struct_blob(int field, inline void CompactProtocolFieldWriter::field_binary(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_BINARY); + put_field_header(field, current_field_value, FieldType::BINARY); put_uint(val.size()); put_byte(val.data(), static_cast(val.size())); current_field_value = field; @@ -386,7 +387,7 @@ inline void CompactProtocolFieldWriter::field_binary(int field, std::vector(val.data()), static_cast(val.size())); @@ -396,8 +397,8 @@ inline void CompactProtocolFieldWriter::field_string(int field, std::string cons inline void CompactProtocolFieldWriter::field_string_list(int field, std::vector const& val) { - put_field_header(field, current_field_value, ST_FLD_LIST); - put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_BINARY)); + put_field_header(field, current_field_value, FieldType::LIST); + put_packed_type_byte(val.size(), FieldType::BINARY); if (val.size() >= 0xf) put_uint(val.size()); for (auto& v : val) { put_uint(v.size()); diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index a2ed0f1f4dc..2ed7c078f8b 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,7 +78,14 @@ class CompactProtocolFieldWriter { uint32_t put_int(int64_t v); - void put_field_header(int f, int cur, int t); + template + void put_packed_type_byte(T high_bits, FieldType t) + { + uint8_t const clamped_high_bits = std::min(std::max(high_bits, T{0}), T{0xf}); + put_byte((clamped_high_bits << 4) | static_cast(t)); + } + + void put_field_header(int f, int cur, FieldType t); inline void field_bool(int field, bool b); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 8e1c0682ffd..e16551024d1 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2207,13 +2207,14 @@ inline __device__ uint8_t* cpw_put_int64(uint8_t* p, int64_t v) return cpw_put_uint64(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, int t) +inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, FieldType t) { + auto const t_num = static_cast(t); if (f > cur && f <= cur + 15) { - *p++ = ((f - cur) << 4) | t; + *p++ = ((f - cur) << 4) | t_num; return p; } else { - *p++ = t; + *p++ = t_num; return cpw_put_int32(p, f); } } @@ -2231,7 +2232,7 @@ class header_encoder { inline __device__ void field_struct_begin(int field) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_STRUCT); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::STRUCT); current_field_index = 0; } @@ -2241,11 +2242,13 @@ class header_encoder { current_field_index = field; } - inline __device__ void field_list_begin(int field, size_t len, int type) + inline __device__ void field_list_begin(int field, size_t len, FieldType type) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, 
current_field_index, ST_FLD_LIST); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); + auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | type)); + current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2254,7 +2257,9 @@ class header_encoder { inline __device__ void put_bool(bool value) { - current_header_ptr = cpw_put_uint8(current_header_ptr, value ? ST_FLD_TRUE : ST_FLD_FALSE); + auto const type_byte = + static_cast(value ? FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); + current_header_ptr = cpw_put_uint8(current_header_ptr, type_byte); } inline __device__ void put_binary(void const* value, uint32_t length) @@ -2272,15 +2277,18 @@ class header_encoder { inline __device__ void field_bool(int field, bool value) { - current_header_ptr = cpw_put_fldh( - current_header_ptr, field, current_field_index, value ? ST_FLD_TRUE : ST_FLD_FALSE); + current_header_ptr = cpw_put_fldh(current_header_ptr, + field, + current_field_index, + value ? 
FieldType::BOOLEAN_TRUE : FieldType::BOOLEAN_FALSE); current_field_index = field; } template inline __device__ void field_int32(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I32); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I32); current_header_ptr = cpw_put_int32(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2288,7 +2296,8 @@ class header_encoder { template inline __device__ void field_int64(int field, T value) { - current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_I64); + current_header_ptr = + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::I64); current_header_ptr = cpw_put_int64(current_header_ptr, static_cast(value)); current_field_index = field; } @@ -2296,7 +2305,7 @@ class header_encoder { inline __device__ void field_binary(int field, void const* value, uint32_t length) { current_header_ptr = - cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_BINARY); + cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::BINARY); current_header_ptr = cpw_put_uint32(current_header_ptr, length); memcpy(current_header_ptr, value, length); current_header_ptr += length; @@ -2868,13 +2877,13 @@ __global__ void __launch_bounds__(1) : align8(ck_g->column_index_blob + ck_g->column_index_size - column_index_truncate_length); // null_pages - encoder.field_list_begin(1, num_data_pages, ST_FLD_TRUE); + encoder.field_list_begin(1, num_data_pages, FieldType::BOOLEAN_TRUE); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_bool(column_stats[pageidx + page].non_nulls == 0); } encoder.field_list_end(1); // min_values - encoder.field_list_begin(2, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(2, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto 
const [min_ptr, min_size] = get_extremum(&column_stats[pageidx + page].min_value, col_g.stats_dtype, @@ -2885,7 +2894,7 @@ __global__ void __launch_bounds__(1) } encoder.field_list_end(2); // max_values - encoder.field_list_begin(3, num_data_pages, ST_FLD_BINARY); + encoder.field_list_begin(3, num_data_pages, FieldType::BINARY); for (uint32_t page = first_data_page; page < num_pages; page++) { auto const [max_ptr, max_size] = get_extremum(&column_stats[pageidx + page].max_value, col_g.stats_dtype, @@ -2902,7 +2911,7 @@ __global__ void __launch_bounds__(1) col_g.converted_type, num_pages - first_data_page)); // null_counts - encoder.field_list_begin(5, num_data_pages, ST_FLD_I64); + encoder.field_list_begin(5, num_data_pages, FieldType::I64); for (uint32_t page = first_data_page; page < num_pages; page++) { encoder.put_int64(column_stats[pageidx + page].null_count); } @@ -2918,7 +2927,7 @@ __global__ void __launch_bounds__(1) // optionally encode histograms and sum var_bytes. if (cd->max_rep_level > REP_LVL_HIST_CUTOFF) { - encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), ST_FLD_I64); + encoder.field_list_begin(6, num_data_pages * (cd->max_rep_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_rep_level + 1; i++) { encoder.put_int64(page.rep_histogram[i]); @@ -2929,7 +2938,7 @@ __global__ void __launch_bounds__(1) } if (cd->max_def_level > DEF_LVL_HIST_CUTOFF) { - encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), ST_FLD_I64); + encoder.field_list_begin(7, num_data_pages * (cd->max_def_level + 1), FieldType::I64); thrust::for_each(thrust::seq, page_start, page_end, [&] __device__(auto const& page) { for (int i = 0; i < cd->max_def_level + 1; i++) { encoder.put_int64(page.def_histogram[i]); diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 36157f725e3..cc3f584422d 100644 --- 
a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -114,28 +114,28 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) field_type = c & 0xf; if (!(c & 0xf0)) get_i32(bs); } - switch (field_type) { - case ST_FLD_TRUE: - case ST_FLD_FALSE: break; - case ST_FLD_I16: - case ST_FLD_I32: - case ST_FLD_I64: get_u32(bs); break; - case ST_FLD_BYTE: skip_bytes(bs, 1); break; - case ST_FLD_DOUBLE: skip_bytes(bs, 8); break; - case ST_FLD_BINARY: skip_bytes(bs, get_u32(bs)); break; - case ST_FLD_LIST: - case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled + switch (static_cast(field_type)) { + case FieldType::BOOLEAN_TRUE: + case FieldType::BOOLEAN_FALSE: break; + case FieldType::I16: + case FieldType::I32: + case FieldType::I64: get_u32(bs); break; + case FieldType::I8: skip_bytes(bs, 1); break; + case FieldType::DOUBLE: skip_bytes(bs, 8); break; + case FieldType::BINARY: skip_bytes(bs, get_u32(bs)); break; + case FieldType::LIST: + case FieldType::SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; if (n == 0xf) { n = get_u32(bs); } field_type = c & 0xf; - if (field_type == ST_FLD_STRUCT) { + if (static_cast(field_type) == FieldType::STRUCT) { struct_depth += n; } else { rep_cnt = n; } } break; - case ST_FLD_STRUCT: struct_depth++; break; + case FieldType::STRUCT: struct_depth++; break; } } while (rep_cnt || struct_depth); } @@ -180,7 +180,7 @@ struct ParquetFieldInt32 { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = get_i32(bs); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -199,7 +199,7 @@ struct ParquetFieldEnum { inline __device__ bool 
operator()(byte_stream_s* bs, int field_type) { val = static_cast(get_i32(bs)); - return (field_type != ST_FLD_I32); + return (static_cast(field_type) != FieldType::I32); } }; @@ -218,7 +218,7 @@ struct ParquetFieldStruct { inline __device__ bool operator()(byte_stream_s* bs, int field_type) { - return ((field_type != ST_FLD_STRUCT) || !op(bs)); + return ((static_cast(field_type) != FieldType::STRUCT) || !op(bs)); } }; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 50736197eb9..a680e44f360 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,19 +140,20 @@ enum BoundaryOrder { /** * @brief Thrift compact protocol struct field types */ -enum FieldType { - ST_FLD_TRUE = 1, - ST_FLD_FALSE = 2, - ST_FLD_BYTE = 3, - ST_FLD_I16 = 4, - ST_FLD_I32 = 5, - ST_FLD_I64 = 6, - ST_FLD_DOUBLE = 7, - ST_FLD_BINARY = 8, - ST_FLD_LIST = 9, - ST_FLD_SET = 10, - ST_FLD_MAP = 11, - ST_FLD_STRUCT = 12, +enum class FieldType : uint8_t { + BOOLEAN_TRUE = 1, + BOOLEAN_FALSE = 2, + I8 = 3, + I16 = 4, + I32 = 5, + I64 = 6, + DOUBLE = 7, + BINARY = 8, + LIST = 9, + SET = 10, + MAP = 11, + STRUCT = 12, + UUID = 13, }; } // namespace cudf::io::parquet::detail From 8df33eed224bf33ad6013179459dfe41e5f26b2a Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 9 Jan 2024 18:47:38 -0600 Subject: [PATCH 03/23] Expand JIT groupby test suite (#13813) This PR reorganizes and expands the test suite for groupby apply functions using the JIT engine to include nan cases and cases where the groups are larger than a single thread block. 
Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13813 --- python/cudf/cudf/tests/test_groupby.py | 371 ++++++++++++++++--------- 1 file changed, 238 insertions(+), 133 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 57aa6e72eae..b46949faa06 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import collections import datetime @@ -392,19 +392,70 @@ def emulate(df): @pytest.fixture(scope="module") -def groupby_jit_data(): - np.random.seed(0) +def groupby_jit_data_small(): + """ + Return a small dataset for testing JIT Groupby Apply. The dataframe + contains 4 groups of size 1, 2, 3, 4 as well as an additional key + column that can be used to test subgroups within groups. This data + is useful for smoke testing basic numeric results + """ + rng = np.random.default_rng(42) df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - df["val3"] = np.random.randint(0, 10, nelem) - df["val4"] = np.random.randint(0, 10, nelem) + key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 + key2 = [1, 2] * 5 + df["key1"] = key1 + df["key2"] = key2 + + df["val1"] = rng.integers(0, 10, len(key1)) + df["val2"] = rng.integers(0, 10, len(key1)) + + # randomly permute data + df = df.sample(frac=1, ignore_index=True) return df +@pytest.fixture(scope="module") +def groupby_jit_data_large(groupby_jit_data_small): + """ + Larger version of groupby_jit_data_small which contains enough data + to require more than one block per group. 
This data is useful for + testing if JIT GroupBy algorithms scale to larger datasets without + manifesting numerical issues such as overflow. + """ + max_tpb = 1024 + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple + df = cudf.concat([groupby_jit_data_small] * factor) + + return df + + +@pytest.fixture(scope="module") +def groupby_jit_data_nans(groupby_jit_data_small): + """ + Returns a modified version of groupby_jit_data_small which contains + nan values. + """ + + df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"] = df["val1"].astype("float64") + df["val1"][::2] = np.nan + df = df.sample(frac=1, ignore_index=True) + return df + + +@pytest.fixture(scope="module") +def groupby_jit_datasets( + groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans +): + return { + "small": groupby_jit_data_small, + "large": groupby_jit_data_large, + "nans": groupby_jit_data_nans, + } + + + def run_groupby_apply_jit_test(data, func, keys, *args): expect_groupby_obj = data.to_pandas().groupby(keys) got_groupby_obj = data.groupby(keys) @@ -415,6 +466,30 @@ def run_groupby_apply_jit_test(data, func, keys, *args): assert_groupby_results_equal(cudf_jit_result, pandas_result) +def groupby_apply_jit_reductions_test_inner(func, data, dtype): + # ideally we'd just have: + # lambda group: getattr(group, func)() + # but the current kernel caching mechanism relies on pickle which + # does not play nice with local functions. 
What's below uses + # exec as a workaround to write the test functions dynamically + + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test unary reductions @pytest.mark.parametrize( "dtype", SUPPORTED_GROUPBY_NUMPY_TYPES, @@ -423,13 +498,41 @@ def run_groupby_apply_jit_test(data, func, keys, *args): @pytest.mark.parametrize( "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) -def test_groupby_apply_jit_reductions(func, groupby_jit_data, dtype): - # ideally we'd just have: - # lambda group: getattr(group, func)() - # but the current kernel caching mechanism relies on pickle which - # does not play nice with local functions. What's below uses - # exec as a workaround to write the test functions dynamically +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_unary_reductions( + func, dtype, dataset, groupby_jit_datasets +): + dataset = groupby_jit_datasets[dataset] + + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + +# test unary reductions for special values +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val +): + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test unary index reductions for special values +def groupby_apply_jit_idx_reductions_special_vals_inner( + func, data, dtype, special_val +): funcstr = textwrap.dedent( f""" def func(df): @@ -440,36 +543,129 @@ def func(df): exec(funcstr, lcl) func = 
lcl["func"] - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - groupby_jit_data["val2"] = groupby_jit_data["val2"].astype(dtype) + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(data, func, ["key1"]) -@pytest.mark.parametrize("dtype", SUPPORTED_GROUPBY_NUMPY_TYPES) -def test_groupby_apply_jit_correlation(groupby_jit_data, dtype): +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_reductions_special_vals( + func, dtype, dataset, groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) - groupby_jit_data["val3"] = groupby_jit_data["val3"].astype(dtype) - groupby_jit_data["val4"] = groupby_jit_data["val4"].astype(dtype) - keys = ["key1", "key2"] +@pytest.mark.parametrize("dtype", ["float64"]) +@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) +@pytest.mark.parametrize( + "special_val", + [ + pytest.param( + np.nan, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13832" + ), + ), + np.inf, + -np.inf, + ], +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +def test_groupby_apply_jit_idx_reductions_special_vals( + func, dtype, dataset, groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset] + groupby_apply_jit_idx_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) + + +@pytest.mark.parametrize("dtype", ["int32"]) +def test_groupby_apply_jit_sum_integer_overflow(dtype): + max = np.iinfo(dtype).max + + data = DataFrame( 
+ { + "a": [0, 0, 0], + "b": [max, max, max], + } + ) def func(group): - return group["val3"].corr(group["val4"]) + return group["b"].sum() - if dtype.kind == "f": + run_groupby_apply_jit_test(data, func, ["a"]) + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize( + "dataset", + [ + pytest.param( + "small", + marks=[ + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero encountered in divide" + ), + ], + ), + "large", + ], +) +def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): + + dataset = groupby_jit_datasets[dataset] + + dataset["val1"] = dataset["val1"].astype(dtype) + dataset["val2"] = dataset["val2"].astype(dtype) + + keys = ["key1"] + + def func(group): + return group["val1"].corr(group["val2"]) + + if np.dtype(dtype).kind == "f": + # Correlation of floating types is not yet supported: + # https://github.com/rapidsai/cudf/issues/13839 m = ( f"Series.corr\\(Series\\) is not " f"supported for \\({dtype}, {dtype}\\)" ) with pytest.raises(UDFError, match=m): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(dataset, func, keys) return - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(dataset, func, keys) + + +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +def test_groupby_apply_jit_correlation_zero_variance(dtype): + # pearson correlation is undefined when the variance of either + # variable is zero. This test ensures that the jit implementation + # returns the same result as pandas in this case. 
+ data = DataFrame( + {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} + ) + + def func(group): + return group["b"].corr(group["c"]) + + run_groupby_apply_jit_test(data, func, ["a"]) @pytest.mark.parametrize("op", unary_ops) -def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data, op): +def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op): keys = ["key1"] def func(group): @@ -479,11 +675,13 @@ def func(group): UDFError, match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy", ): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) @pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error(groupby_jit_data, op): +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): keys = ["key1"] def func(group): @@ -493,10 +691,10 @@ def func(group): UDFError, match=f"{op.__name__}\\(Series, Series\\) is not supported", ): - run_groupby_apply_jit_test(groupby_jit_data, func, keys) + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) -def test_groupby_apply_jit_no_df_ops(groupby_jit_data): +def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small): # DataFrame level operations are not yet supported. def func(group): return group.sum() @@ -505,7 +703,7 @@ def func(group): UDFError, match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)", ): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"]) @pytest.mark.parametrize("dtype", ["uint8", "str"]) @@ -529,101 +727,6 @@ def func(group): run_groupby_apply_jit_test(df, func, ["a"]) -@pytest.mark.parametrize("dtype", ["int32", "int64"]) -def test_groupby_apply_jit_correlation_zero_variance(dtype): - # pearson correlation is undefined when the variance of either - # variable is zero. 
This test ensures that the jit implementation - # returns the same result as pandas in this case. - data = DataFrame( - {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} - ) - - def func(group): - return group["b"].corr(group["c"]) - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["int32"]) -def test_groupby_apply_jit_sum_integer_overflow(dtype): - max = np.iinfo(dtype).max - - data = DataFrame( - { - "a": [0, 0, 0], - "b": [max, max, max], - } - ) - - def func(group): - return group["b"].sum() - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -def test_groupby_apply_jit_reductions_special_vals( - func, groupby_jit_data, dtype, special_val -): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@pytest.mark.parametrize( - "special_val", - [ - pytest.param( - np.nan, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13832" - ), - ), - np.inf, - -np.inf, - ], -) -def test_groupby_apply_jit_idx_reductions_special_vals( - func, groupby_jit_data, dtype, special_val -): - # dynamically generate to avoid pickling error. - # see test_groupby_apply_jit_reductions for details. 
- funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - groupby_jit_data["val1"] = special_val - groupby_jit_data["val1"] = groupby_jit_data["val1"].astype(dtype) - - expect = groupby_jit_data.to_pandas().groupby("key1").apply(func) - got = groupby_jit_data.groupby("key1").apply(func, engine="jit") - - assert_eq(expect, got, check_dtype=False) - - @pytest.mark.parametrize( "func", [ @@ -632,8 +735,8 @@ def func(df): lambda df: df["val1"].mean() + df["val2"].std(), ], ) -def test_groupby_apply_jit_basic(func, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"]) +def test_groupby_apply_jit_basic(func, groupby_jit_data_small): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) def create_test_groupby_apply_jit_args_params(): @@ -652,8 +755,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) -def test_groupby_apply_jit_args(func, args, groupby_jit_data): - run_groupby_apply_jit_test(groupby_jit_data, func, ["key1", "key2"], *args) +def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) def test_groupby_apply_jit_block_divergence(): From 6a23775db29dc4b38820994297c94201c9287aaf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Jan 2024 15:28:34 -1000 Subject: [PATCH 04/23] Ensure column.fillna signatures are consistent (#14724) Aligns the definitions of `Columns.fillna` among all subclasses. `dtype` looks to only needed in certain instances to cast the fill value so can do that separately. 
A `fill_nan` can be avoided with its single usage in a `can_cast` routine by checking for nan first Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14724 --- python/cudf/cudf/core/column/categorical.py | 9 ++++---- python/cudf/cudf/core/column/column.py | 9 ++++---- python/cudf/cudf/core/column/datetime.py | 5 +++-- python/cudf/cudf/core/column/decimal.py | 22 +++++++++---------- python/cudf/cudf/core/column/numerical.py | 19 ++++++++-------- python/cudf/cudf/core/column/string.py | 12 +++++------ python/cudf/cudf/core/column/timedelta.py | 24 ++++++++++----------- python/cudf/cudf/tests/test_timedelta.py | 3 +-- 8 files changed, 50 insertions(+), 53 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7036a9ee870..c7e7cf2bf7e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -1236,9 +1236,8 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Any = None, - dtype: Optional[Dtype] = None, - ) -> CategoricalColumn: + method: Optional[str] = None, + ) -> Self: """ Fill null values with *fill_value* """ @@ -1276,7 +1275,7 @@ def fillna( self.codes.dtype ) - return super().fillna(value=fill_value, method=method) + return super().fillna(fill_value, method=method) def indices_of( self, value: ScalarLike diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 296fd6a41b0..440ac855691 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -710,16 +710,15 @@ def _check_scatter_key_length( def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ return libcudf.replace.replace_nulls( - input_col=self, replacement=value, method=method, dtype=dtype + input_col=self, replacement=fill_value, method=method )._with_type_metadata(self.dtype) def isnull(self) -> ColumnBase: @@ -929,7 +928,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. - result = result.fillna(rhs.null_count > 0, dtype=bool) + result = result.fillna(cudf.Scalar(rhs.null_count > 0)) return result def as_mask(self) -> Buffer: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 466ea3220c8..5aa75365389 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -598,12 +599,12 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> DatetimeColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) if is_scalar(fill_value): + # TODO: Add cast checking like TimedeltaColumn.fillna if not isinstance(fill_value, cudf.Scalar): fill_value = cudf.Scalar(fill_value, dtype=self.dtype) else: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 02e03f92745..299875f0091 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA 
CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import warnings from decimal import Decimal @@ -7,6 +7,7 @@ import cupy as cp import numpy as np import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -125,29 +126,28 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, - value: Any = None, + fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ): + ) -> Self: """Fill null values with ``value``. Returns a copy with null filled. """ - if isinstance(value, (int, Decimal)): - value = cudf.Scalar(value, dtype=self.dtype) + if isinstance(fill_value, (int, Decimal)): + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) elif ( - isinstance(value, DecimalBaseColumn) - or isinstance(value, cudf.core.column.NumericalColumn) - and is_integer_dtype(value.dtype) + isinstance(fill_value, DecimalBaseColumn) + or isinstance(fill_value, cudf.core.column.NumericalColumn) + and is_integer_dtype(fill_value.dtype) ): - value = value.astype(self.dtype) + fill_value = fill_value.astype(self.dtype) else: raise TypeError( "Decimal columns only support using fillna with decimal and " "integer values" ) - return super().fillna(value=value, method=method) + return super().fillna(fill_value, method=method) def normalize_binop_value(self, other): if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f40886bf153..e848c86897f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -16,6 +16,7 @@ import cupy as cp import numpy as np import pandas as pd +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -291,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: + def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. if self.dtype.kind != "f" or self.nan_count == 0: return self @@ -533,13 +534,11 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - fill_nan: bool = True, - ) -> NumericalColumn: + ) -> Self: """ Fill null values with *fill_value* """ - col = self.nans_to_nulls() if fill_nan else self + col = self.nans_to_nulls() if col.null_count == 0: return col @@ -574,8 +573,8 @@ def fillna( if not (new_fill_value == fill_value).all(): raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to " - f"{cudf.dtype(dtype).type.__name__}" + f"{fill_value.dtype.type.__name__} to " + f"{col.dtype.type.__name__}" ) fill_value = new_fill_value else: @@ -652,12 +651,14 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # want to cast float to int: elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: + if self.nan_count > 0: + return False iinfo = np.iinfo(to_dtype) min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): - filled = self.fillna(0, fill_nan=False) + filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else: return False diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7bf81f3e2d3..06b5ac31ca6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -21,6 +21,7 @@ import pandas as pd import pyarrow as pa from numba import cuda +from typing_extensions import Self import cudf import cudf.api.types @@ -5824,17 +5825,16 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> StringColumn: + ) -> Self: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) elif cudf._lib.scalar._is_null_host_scalar(fill_value): # Trying to fill with value? Return copy. return self.copy(deep=True) - return super().fillna(value=fill_value, dtype="object") - else: - return super().fillna(method=method) + else: + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + return super().fillna(fill_value, method=method) def normalize_binop_value( self, other diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 572b3b894dc..d664b0f18df 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from typing_extensions import Self import cudf from cudf import _lib as libcudf @@ -281,24 +282,21 @@ def fillna( self, fill_value: Any = None, method: Optional[str] = None, - dtype: Optional[Dtype] = None, - ) -> TimeDeltaColumn: + ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): return self.copy(deep=True) - col: ColumnBase = self if is_scalar(fill_value): - if isinstance(fill_value, np.timedelta64): - dtype = determine_out_dtype(self.dtype, fill_value.dtype) - fill_value = fill_value.astype(dtype) - col = col.astype(dtype) - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=dtype) + fill_value = cudf.Scalar(fill_value) + dtype = determine_out_dtype(self.dtype, fill_value.dtype) + fill_value = fill_value.astype(dtype) + if self.dtype != dtype: + return cast( + Self, self.astype(dtype).fillna(fill_value, method) + ) else: fill_value = column.as_column(fill_value, nan_as_null=False) - return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) - else: - return super().fillna(method=method) + return super().fillna(fill_value, method) def as_numerical_column( self, dtype: Dtype, **kwargs diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 139ce1c4ca3..d86612d3143 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import datetime import operator @@ -1024,7 +1024,6 @@ def local_assert(expected, actual): [ np.timedelta64(4, "s"), np.timedelta64(456, "D"), - np.timedelta64(46, "h"), np.timedelta64("nat"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), From fa37e13db360e0b685bc6af020aa7510f1fbbdbd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jan 2024 08:18:15 -1000 Subject: [PATCH 05/23] Replace as_numerical with as_numerical_column/codes (#14719) * For datetime/timedelta, replaced with `as_numerical_column("int64")` * For categorical, replaced with `codes` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/14719 --- python/cudf/cudf/core/column/categorical.py | 25 ++++--------- python/cudf/cudf/core/column/datetime.py | 41 +++++++++------------ python/cudf/cudf/core/column/timedelta.py | 39 +++++++++----------- python/cudf/cudf/core/dataframe.py | 8 ++-- 4 files changed, 46 insertions(+), 67 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c7e7cf2bf7e..f3f2be0cc45 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -754,7 +754,7 @@ def __contains__(self, item: ScalarLike) -> bool: self._encode(item) except ValueError: return False - return self._encode(item) in self.as_numerical + return self._encode(item) in self.codes def set_base_data(self, value): if value is not None: @@ -799,15 +799,6 @@ def children(self) -> Tuple[NumericalColumn]: self._children = (codes_column,) return self._children - @property - def as_numerical(self) -> NumericalColumn: - return cast( - cudf.core.column.NumericalColumn, - column.build_column( - data=self.codes.data, dtype=self.codes.dtype, mask=self.mask - ), - ) - @property def categories(self) -> ColumnBase: return self.dtype.categories._values @@ 
-925,7 +916,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: "The only binary operations supported by unordered " "categorical columns are equality and inequality." ) - return self.as_numerical._binaryop(other.as_numerical, op) + return self.codes._binaryop(other.codes, op) def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if isinstance(other, column.ColumnBase): @@ -950,7 +941,7 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: def sort_values( self, ascending: bool = True, na_position="last" ) -> CategoricalColumn: - codes = self.as_numerical.sort_values(ascending, na_position) + codes = self.codes.sort_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -961,7 +952,7 @@ def sort_values( return col def element_indexing(self, index: int) -> ScalarLike: - val = self.as_numerical.element_indexing(index) + val = self.codes.element_indexing(index) return self._decode(int(val)) if val is not None else val @property @@ -1053,7 +1044,7 @@ def data_array_view( return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: - codes = self.as_numerical.unique() + codes = self.codes.unique() return column.build_categorical_column( categories=self.categories, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -1280,15 +1271,15 @@ def fillna( def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - return self.as_numerical.indices_of(self._encode(value)) + return self.codes.indices_of(self._encode(value)) @property def is_monotonic_increasing(self) -> bool: - return bool(self.ordered) and self.as_numerical.is_monotonic_increasing + return bool(self.ordered) and self.codes.is_monotonic_increasing @property def is_monotonic_decreasing(self) -> bool: - return bool(self.ordered) and self.as_numerical.is_monotonic_decreasing 
+ return bool(self.ordered) and self.codes.is_monotonic_decreasing def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 5aa75365389..2b44b46bb9e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -268,7 +268,9 @@ def __contains__(self, item: ScalarLike) -> bool: # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item_as_dt64.astype("int64") in self.as_numerical + return item_as_dt64.astype("int64") in self.as_numerical_column( + "int64" + ) @property def time_unit(self) -> str: @@ -397,19 +399,6 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return NotImplemented - @property - def as_numerical(self) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", - column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), - ) - @property def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { @@ -449,9 +438,14 @@ def as_timedelta_column( def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + col = column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) + return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_string_column( self, dtype: Dtype, format=None, **kwargs @@ -484,7 +478,7 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical.mean( + self.as_numerical_column("int64").mean( skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -498,7 +492,7 @@ def std( 
ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std( + self.as_numerical_column("int64").std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -506,7 +500,8 @@ def std( def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical.median(skipna=skipna), unit=self.time_unit + self.as_numerical_column("int64").median(skipna=skipna), + unit=self.time_unit, ) def quantile( @@ -516,7 +511,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical.quantile( + result = self.as_numerical_column("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -617,12 +612,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical - return self.as_numerical.indices_of(value) + ).as_numerical_column("int64") + return self.as_numerical_column("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical.is_unique + return self.as_numerical_column("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index d664b0f18df..2f842130f48 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -115,7 +115,7 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. 
return False - return item.view("int64") in self.as_numerical + return item.view("int64") in self.as_numerical_column("int64") @property def values(self): @@ -134,7 +134,9 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical.data_array_view(mode="read").copy_to_host() + self.as_numerical_column("int64") + .data_array_view(mode="read") + .copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -261,19 +263,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: return cudf.Scalar(other) return NotImplemented - @property - def as_numerical(self) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", - column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ), - ) - @property def time_unit(self) -> str: return self._time_unit @@ -301,9 +290,14 @@ def fillna( def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - return cast( - "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + col = column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, ) + return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_datetime_column( self, dtype: Dtype, **kwargs @@ -337,13 +331,14 @@ def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.mean(skipna=skipna, dtype=dtype), + self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.median(skipna=skipna), unit=self.time_unit + self.as_numerical_column("int64").median(skipna=skipna), + unit=self.time_unit, ) 
def isin(self, values: Sequence) -> ColumnBase: @@ -356,7 +351,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical.quantile( + result = self.as_numerical_column("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -376,7 +371,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. - self.as_numerical.sum( # type: ignore + self.as_numerical_column("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -390,7 +385,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std( + self.as_numerical_column("int64").std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 01935fec8c3..8dd81c92994 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -2379,15 +2379,13 @@ def scatter_by_map( # Convert string or categorical to integer if isinstance(map_index, cudf.core.column.StringColumn): - map_index = map_index.as_categorical_column( - "category" - ).as_numerical + map_index = map_index.as_categorical_column("category").codes warnings.warn( "Using StringColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." ) elif isinstance(map_index, cudf.core.column.CategoricalColumn): - map_index = map_index.as_numerical + map_index = map_index.codes warnings.warn( "Using CategoricalColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." 
From 3f19d04c1889e7114cbed9081359f3bdcb9f6e64 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:11:28 -1000 Subject: [PATCH 06/23] Remove unneeded methods in Column (#14730) * `valid_count` can be composed of `null_count` or where checked `has_nulls` * `contains_na_entries` is redundant with `has_nulls` * Better typing in `searchsorted` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14730 --- python/cudf/cudf/core/_base_index.py | 16 +++++++++++----- python/cudf/cudf/core/column/categorical.py | 4 +++- python/cudf/cudf/core/column/column.py | 20 ++++++-------------- python/cudf/cudf/core/column/numerical.py | 6 +----- python/cudf/cudf/core/dataframe.py | 11 ++++++++--- python/cudf/cudf/core/frame.py | 9 +++++++-- python/cudf/cudf/core/index.py | 11 ++++++----- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_categorical.py | 4 ++-- python/cudf/cudf/tests/test_orc.py | 8 +++++--- python/cudf/cudf/utils/dtypes.py | 4 ++-- 11 files changed, 52 insertions(+), 43 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8d2506403d4..2aef77b6c99 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,11 +2,10 @@ from __future__ import annotations -import builtins import pickle import warnings from functools import cached_property -from typing import Any, Set, Tuple +from typing import Any, Literal, Set, Tuple import pandas as pd from typing_extensions import Self @@ -1702,6 +1701,8 @@ def find_label_range(self, loc: slice) -> slice: start = loc.start stop = loc.stop step = 1 if loc.step is None else loc.step + start_side: Literal["left", "right"] + stop_side: Literal["left", "right"] if step < 0: start_side, stop_side = "right", "left" else: @@ -1725,9 +1726,9 @@ def 
find_label_range(self, loc: slice) -> slice: def searchsorted( self, value, - side: builtins.str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: builtins.str = "last", + na_position: Literal["first", "last"] = "last", ): """Find index where elements should be inserted to maintain order @@ -1754,7 +1755,12 @@ def searchsorted( """ raise NotImplementedError - def get_slice_bound(self, label, side: builtins.str, kind=None) -> int: + def get_slice_bound( + self, + label, + side: Literal["left", "right"], + kind: Literal["ix", "loc", "getitem", None] = None, + ) -> int: """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f3f2be0cc45..59fd4631067 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1372,7 +1372,9 @@ def _concat( # improved as the concatenation API is solidified. 
# Find the first non-null column: - head = next((obj for obj in objs if obj.valid_count), objs[0]) + head = next( + (obj for obj in objs if not obj.null_count != len(obj)), objs[0] + ) # Combine and de-dupe the categories cats = column.concat_columns([o.categories for o in objs]).unique() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 440ac855691..6af39dd3558 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -12,6 +12,7 @@ Any, Dict, List, + Literal, MutableSequence, Optional, Sequence, @@ -428,11 +429,6 @@ def _fill( def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) - @property - def valid_count(self) -> int: - """Number of non-null values""" - return len(self) - self.null_count - @property def nullmask(self) -> Buffer: """The gpu buffer for the null-mask""" @@ -1159,9 +1155,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def searchsorted( self, value, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if not isinstance(value, ColumnBase) or value.dtype != self.dtype: raise ValueError( @@ -1304,10 +1300,6 @@ def _reduce( return libcudf.reduce.reduce(op, preprocessed, **kwargs) return preprocessed - @property - def contains_na_entries(self) -> bool: - return self.null_count != 0 - def _process_for_reduction( self, skipna: Optional[bool] = None, min_count: int = 0 ) -> Union[ColumnBase, ScalarLike]: @@ -2742,7 +2734,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. 
# Notice, we can always cast pure null columns - not_null_col_dtypes = [o.dtype for o in objs if o.valid_count] + not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( _is_non_decimal_numeric_dtype(dtyp) and np.issubdtype(dtyp, np.datetime64) @@ -2754,13 +2746,13 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: objs = [obj.astype(common_dtype) for obj in objs] # Find the first non-null column: - head = next((obj for obj in objs if obj.valid_count), objs[0]) + head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0]) for i, obj in enumerate(objs): # Check that all columns are the same type: if not is_dtype_equal(obj.dtype, head.dtype): # if all null, cast to appropriate dtype - if obj.valid_count == 0: + if obj.null_count == len(obj): objs[i] = column_empty_like( head, dtype=head.dtype, masked=True, newsize=len(obj) ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index e848c86897f..6ef3a6abacc 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -140,7 +140,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: else: return super().indices_of(value) - def has_nulls(self, include_nan=False): + def has_nulls(self, include_nan: bool = False) -> bool: return bool(self.null_count != 0) or ( include_nan and bool(self.nan_count != 0) ) @@ -425,10 +425,6 @@ def dropna(self, drop_nan: bool = False) -> NumericalColumn: col = self.nans_to_nulls() if drop_nan else self return drop_nulls([col])[0] - @property - def contains_na_entries(self) -> bool: - return (self.nan_count != 0) or (self.null_count != 0) - def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8dd81c92994..a3373951a06 100644 --- a/python/cudf/cudf/core/dataframe.py +++ 
b/python/cudf/cudf/core/dataframe.py @@ -5997,9 +5997,14 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): axis = self._get_axis_from_axis_arg(axis) if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") - + length = len(self) return Series._from_data( - {None: [self._data[col].valid_count for col in self._data.names]}, + { + None: [ + length - self._data[col].null_count + for col in self._data.names + ] + }, as_index(self._data.names), ) @@ -8091,7 +8096,7 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): # non-null Column with the same name is found. if idx not in dtypes: dtypes[idx] = cols[idx].dtype - if cols[idx].valid_count > 0: + if cols[idx].null_count != len(cols[idx]): if idx not in non_null_columns: non_null_columns[idx] = [cols[idx]] else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 123f13f8733..5f7a86e86d8 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -13,6 +13,7 @@ Callable, Dict, List, + Literal, MutableMapping, Optional, Tuple, @@ -882,7 +883,7 @@ def fillna( replace_val = None should_fill = ( col_name in value - and col.contains_na_entries + and col.has_nulls(include_nan=True) and not libcudf.scalar._is_null_host_scalar(replace_val) ) or method is not None if should_fill: @@ -1354,7 +1355,11 @@ def notna(self): @_cudf_nvtx_annotate def searchsorted( - self, values, side="left", ascending=True, na_position="last" + self, + values, + side: Literal["left", "right"] = "left", + ascending: bool = True, + na_position: Literal["first", "last"] = "last", ): """Find indices where elements should be inserted to maintain order diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6bc632e0a53..5c33cd09ad1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from __future__ import annotations @@ -11,6 +11,7 @@ Any, Dict, List, + Literal, MutableMapping, Optional, Sequence, @@ -233,9 +234,9 @@ def _copy_type_metadata( def searchsorted( self, value: int, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ): assert (len(self) <= 1) or ( ascending == (self._step > 0) @@ -2205,9 +2206,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): def searchsorted( self, value, - side: str = "left", + side: Literal["left", "right"] = "left", ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ): value = self.dtype.type(value) return super().searchsorted( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fcb4e77f6a5..8739a61dd8b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1710,7 +1710,7 @@ def _concat(cls, objs, axis=0, index=True): @_cudf_nvtx_annotate def valid_count(self): """Number of non-null values""" - return self._column.valid_count + return len(self) - self._column.null_count @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 49eeff01bee..52b7236b965 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
import operator import string @@ -217,7 +217,7 @@ def test_categorical_masking(): got_masked = sr[got_matches] assert len(expect_masked) == len(got_masked) - assert len(expect_masked) == got_masked.valid_count + assert got_masked.null_count == 0 assert_eq(got_masked, expect_masked) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 7407da9c4ac..4630b6eef0a 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import decimal @@ -812,7 +812,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): if "number_of_values" in file_stats[0][col]: stats_valid_count = file_stats[0][col]["number_of_values"] - actual_valid_count = gdf[col].valid_count + actual_valid_count = len(gdf[col]) - gdf[col].null_count assert normalized_equals(actual_valid_count, stats_valid_count) # compare stripe statistics with actual min/max @@ -827,7 +827,9 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): assert normalized_equals(actual_true_count, stats_true_count) if "number_of_values" in stripes_stats[stripe_idx][col]: - actual_valid_count = stripe_df[col].valid_count + actual_valid_count = ( + len(stripe_df[col]) - stripe_df[col].null_count + ) stats_valid_count = stripes_stats[stripe_idx][col][ "number_of_values" ] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c65404445cb..72721b5197f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import datetime from collections import namedtuple @@ -401,7 +401,7 @@ def min_column_type(x, expected_type): if not isinstance(x, cudf.core.column.NumericalColumn): raise TypeError("Argument x must be of type column.NumericalColumn") - if x.valid_count == 0: + if x.null_count == len(x): return x.dtype if np.issubdtype(x.dtype, np.floating): From 1078326535c9989a2e904d78ceb708a097be989b Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 10 Jan 2024 13:43:58 -0800 Subject: [PATCH 07/23] Write cuDF version in Parquet "created_by" metadata field (#14721) Populate the informational `created_by` field in the Parquet file metadata. Identifying the source of a parquet file can help with tracking down interoperability problems. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14721 --- cpp/CMakeLists.txt | 8 +++++++- cpp/src/io/parquet/writer_impl.cu | 9 ++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a7c34ca489c..cb1fdb1f557 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -658,6 +658,12 @@ set_source_files_properties( PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64" ) +set_property( + SOURCE src/io/parquet/writer_impl.cu + APPEND + PROPERTY COMPILE_DEFINITIONS "CUDF_VERSION=${PROJECT_VERSION}" +) + set_target_properties( cudf PROPERTIES BUILD_RPATH "\$ORIGIN" diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c452f632cd6..279a814a4e1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,6 +55,10 @@ #include #include +#ifndef CUDF_VERSION +#error "CUDF_VERSION is not defined" +#endif + namespace cudf::io::parquet::detail { using namespace cudf::io::detail; @@ -108,7 +112,7 @@ struct aggregate_writer_metadata { meta.num_rows = this->files[part].num_rows; meta.row_groups = this->files[part].row_groups; meta.key_value_metadata = this->files[part].key_value_metadata; - meta.created_by = this->created_by; + meta.created_by = "cudf version " CUDF_STRINGIFY(CUDF_VERSION); meta.column_orders = this->column_orders; return meta; } @@ -171,7 +175,6 @@ struct aggregate_writer_metadata { std::vector> column_indexes; }; std::vector files; - std::string created_by = ""; thrust::optional> column_orders = thrust::nullopt; }; From 516559e76cfed67d58dcc4c2116ed93bfcf193ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 06:18:55 -1000 Subject: [PATCH 08/23] Explicitly pass .dtype into is_foo_dtype functions (#14657) Just promotes more explicitness when using dtype checking functions Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: 
https://github.com/rapidsai/cudf/pull/14657 --- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/testing/testing.py | 10 +++++++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index f76802c8b7d..ef6b10f66c1 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -103,7 +103,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_col) and not is_bool_dtype(common_dtype) + is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) ): raise TypeError(mixed_err) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6ef3a6abacc..148fa252fad 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -272,13 +272,13 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other): + if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other): + if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): out_dtype = "bool" if ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a3373951a06..813ecc32069 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -324,7 +324,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0]): + if is_bool_dtype(tmp_arg[0].dtype): df = 
columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -6032,7 +6032,7 @@ def _reduce( numeric_cols = ( name for name in self._data.names - if is_numeric_dtype(self._data[name]) + if is_numeric_dtype(self._data[name].dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6078,7 +6078,7 @@ def _reduce( numeric_cols = ( name for name in self._data.names - if is_numeric_dtype(self._data[name]) + if is_numeric_dtype(self._data[name].dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index a45733a0f83..6c2f073b7ac 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -232,10 +232,10 @@ def assert_column_equal( elif not ( ( not dtype_can_compare_equal_to_other(left.dtype) - and is_numeric_dtype(right) + and is_numeric_dtype(right.dtype) ) or ( - is_numeric_dtype(left) + is_numeric_dtype(left.dtype) and not dtype_can_compare_equal_to_other(right.dtype) ) ): @@ -245,7 +245,11 @@ def assert_column_equal( left.isnull().values == right.isnull().values ) - if columns_equal and not check_exact and is_numeric_dtype(left): + if ( + columns_equal + and not check_exact + and is_numeric_dtype(left.dtype) + ): # non-null values must be the same columns_equal = cp.allclose( left.apply_boolean_mask( From d26ea6de90ad28e409ffeae4a3a470c41316e969 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 06:23:41 -1000 Subject: [PATCH 09/23] Fix nan_as_null not being respected when passing arrow object (#14688) Similar to https://github.com/rapidsai/cudf/pull/14687, nan was not being interpreted as null when `nan_as_null=True` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14688 --- python/cudf/cudf/core/column/column.py | 
13 +++++++++++-- python/cudf/cudf/tests/test_series.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6af39dd3558..5b638b1f4ad 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,6 +25,7 @@ import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.compute as pc from numba import cuda from typing_extensions import Self @@ -1997,11 +1998,19 @@ def as_column( return col elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): - if isinstance(arbitrary, pa.lib.HalfFloatArray): + if pa.types.is_float16(arbitrary.type): raise NotImplementedError( "Type casting from `float16` to `float32` is not " "yet supported in pyarrow, see: " - "https://issues.apache.org/jira/browse/ARROW-3802" + "https://github.com/apache/arrow/issues/20213" + ) + elif (nan_as_null is None or nan_as_null) and pa.types.is_floating( + arbitrary.type + ): + arbitrary = pc.if_else( + pc.is_nan(arbitrary), + pa.nulls(len(arbitrary), type=arbitrary.type), + arbitrary, ) col = ColumnBase.from_arrow(arbitrary) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 248ac201e12..8898bf70047 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2572,6 +2572,16 @@ def test_series_arrow_list_types_roundtrip(): cudf.from_pandas(pdf) +@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) +@pytest.mark.parametrize( + "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])] +) +def test_nan_as_null_from_arrow_objects(klass, data): + result = klass(data, nan_as_null=True) + expected = klass(pa.array([None], type=pa.float64())) + assert_eq(result, expected) + + @pytest.mark.parametrize("reso", ["M", "ps"]) @pytest.mark.parametrize("typ", ["M", "m"]) def test_series_invalid_reso_dtype(reso, typ): From 
85acdc640701940e47b3969b14a811f33e7faf5b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 09:58:15 -1000 Subject: [PATCH 10/23] Fix constructing Series/Index from arrow array and dtype (#14686) Previously, passing a `dtype=` argument to the constructors was ignored when passing arrow array objects Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14686 --- python/cudf/cudf/core/column/column.py | 21 +++++++++++++-------- python/cudf/cudf/tests/test_series.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5b638b1f4ad..19e76d4a95b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2036,6 +2036,8 @@ def as_column( new_dtype = "str" col = col.astype(new_dtype) + elif dtype is not None: + col = col.astype(dtype) return col @@ -2112,6 +2114,15 @@ def as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) elif arbitrary.dtype.kind == "O": + if len(arbitrary) == 0: + # TODO: Can remove once empty constructor default becomes + # object instead of float. 
+ return as_column( + pa.array([], type=pa.string()), + nan_as_null=nan_as_null, + dtype=dtype, + length=length, + ) if isinstance(arbitrary, pd.arrays.PandasArray): # infer_dtype does not handle PandasArray arbitrary = np.array(arbitrary, dtype=object) @@ -2140,15 +2151,9 @@ def as_column( arbitrary, from_pandas=True, ) - if isinstance(pyarrow_array.type, pa.Decimal128Type): - pyarrow_type = cudf.Decimal128Dtype.from_arrow( - pyarrow_array.type - ) - else: - pyarrow_type = arbitrary.dtype data = as_column( pyarrow_array, - dtype=pyarrow_type, + dtype=dtype, nan_as_null=nan_as_null, length=length, ) @@ -2280,7 +2285,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("O", "U"): - data = as_column(pa.array(arbitrary), dtype=arbitrary.dtype) + data = as_column(pa.array(arbitrary), dtype=dtype) # There is no cast operation available for pa.Array from int to # str, Hence instead of handling in pa.Array block, we # will have to type-cast here. diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8898bf70047..36033a72479 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2650,6 +2650,26 @@ def test_astype_pandas_nullable_pandas_compat(dtype, klass, kind): ser.astype(kind(dtype)) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +@pytest.mark.parametrize( + "data", + [ + pa.array([1, None], type=pa.int64()), + pa.chunked_array([[1, None]], type=pa.int64()), + ], +) +def test_from_arrow_array_dtype(klass, data): + obj = klass(data, dtype="int8") + assert obj.dtype == np.dtype("int8") + + +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +def test_from_pandas_object_dtype_passed_dtype(klass): + result = klass(pd.Series([True, False], dtype=object), dtype="int8") + expected = klass(pa.array([1, 0], type=pa.int8())) + assert_eq(result, expected) + + def test_series_where_mixed_bool_dtype(): s = cudf.Series([True, False, 
True]) with pytest.raises(TypeError): From 0d87bb7e6edbb108be04773afe7d0eee82258131 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 11 Jan 2024 16:04:59 -0600 Subject: [PATCH 11/23] refactor CUDA versions in dependencies.yaml (#14733) Follow-up to #14644. Contributes to https://github.com/rapidsai/build-planning/issues/7. Similar to https://github.com/rapidsai/rmm/pull/1422, this proposes splitting the `cuda-version` dependency in `dependencies.yaml` out to its own thing, separate from the bits of the CUDA Toolkit `cudf` needs. Some other simplifications: * removes the notebook-specific stuff added in #14722 (which I think were added specifically because `cuda-version` and CTK stuff was coupled) * consolidates two sections with selectors only based on CUDA `{major}.{minor}` Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) --- dependencies.yaml | 69 +++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 94f31240797..28b3afd7bbc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -11,7 +11,8 @@ files: - build_wheels - build_python_common - build_python_cudf - - cudatoolkit + - cuda + - cuda_version - develop - docs - libarrow_build @@ -29,31 +30,32 @@ files: test_cpp: output: none includes: - - cudatoolkit - - test_cpp + - cuda_version - libarrow_run + - test_cpp test_python: output: none includes: - - cudatoolkit + - cuda_version - py_version + - pyarrow_run - test_python_common - test_python_cudf - test_python_dask_cudf - - pyarrow_run test_java: output: none includes: - build_all + - cuda + - cuda_version - libarrow_run - - cudatoolkit - test_java test_notebooks: output: none includes: + - cuda_version - notebooks - py_version - - notebook_cuda_version checks: output: none includes: @@ -62,7 +64,8 @@ 
files: docs: output: none includes: - - cudatoolkit + - cuda + - cuda_version - docs - libarrow_run - py_version @@ -333,37 +336,51 @@ dependencies: # Allow runtime version to float up to minor version # Disallow pyarrow 14.0.0 due to a CVE - pyarrow>=14.0.1,<15.0.0a0 - cudatoolkit: + cuda_version: specific: - output_types: conda matrices: - matrix: - cuda: "12.*" + cuda: "11.2" packages: - - cuda-cudart-dev - - cuda-nvrtc-dev - - cuda-nvtx-dev - - libcurand-dev - - matrix: # Fallback for CUDA 11 or no matrix + - cuda-version=11.2 + - matrix: + cuda: "11.4" packages: - - cudatoolkit - - output_types: conda - matrices: + - cuda-version=11.4 + - matrix: + cuda: "11.5" + packages: + - cuda-version=11.5 + - matrix: + cuda: "11.8" + packages: + - cuda-version=11.8 - matrix: cuda: "12.0" packages: - cuda-version=12.0 + cuda: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "12.*" + packages: + - cuda-cudart-dev + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcurand-dev - matrix: cuda: "11.8" packages: - - cuda-version=11.8 + - cudatoolkit - cuda-nvtx=11.8 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - matrix: cuda: "11.5" packages: - - cuda-version=11.5 - cudatoolkit - cuda-nvtx=11.5 # Can't hard pin the version since 11.x is missing many @@ -373,7 +390,6 @@ dependencies: - matrix: cuda: "11.4" packages: - - cuda-version=11.4 - cudatoolkit - &cudanvtx114 cuda-nvtx=11.4 - &libcurand_dev114 libcurand-dev>=10.2.5.43,<=10.2.5.120 @@ -381,7 +397,6 @@ dependencies: - matrix: cuda: "11.2" packages: - - cuda-version=11.2 - cudatoolkit # The NVIDIA channel doesn't publish pkgs older than 11.4 for # these libs, so 11.2 uses 11.4 packages (the oldest @@ -753,13 +768,3 @@ dependencies: packages: - ipython - openpyxl - notebook_cuda_version: - specific: - - output_types: conda - matrices: - - matrix: {cuda: "12.0"} - packages: - - cuda-version=12.0 - - matrix: {cuda: "11.8"} - packages: - - cuda-version=11.8 From e50fa00aed685395a16d252787a834d308a548bc Mon Sep 17 
00:00:00 2001 From: Shruti Shivakumar Date: Thu, 11 Jan 2024 14:58:50 -0800 Subject: [PATCH 12/23] Expose streams in Parquet reader and writer APIs (#14359) This PR contributes to https://github.com/rapidsai/cudf/issues/13744. -Added stream parameters to public APIs ``` cudf::io::read_parquet cudf::io::write_parquet cudf::io::parquet_chunked_writer cudf::io::chunked_parquet_reader ``` -Added stream gtests Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Mark Harris (https://github.com/harrism) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14359 --- cpp/include/cudf/io/parquet.hpp | 16 ++- cpp/src/io/functions.cpp | 27 ++--- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/io/parquet_test.cpp | 138 ++++++++++++++++++++++++++ 4 files changed, 166 insertions(+), 16 deletions(-) create mode 100644 cpp/tests/streams/io/parquet_test.cpp diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ea18da74d5a..dc035db8d39 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -401,6 +401,7 @@ class parquet_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -408,6 +409,7 @@ class parquet_reader_options_builder { */ table_with_metadata read_parquet( parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -438,11 +440,13 @@ class chunked_parquet_reader { * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ chunked_parquet_reader( std::size_t chunk_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -461,12 +465,14 @@ class chunked_parquet_reader { * @param pass_read_limit Limit on the amount of memory used for reading and decompressing data or * `0` if there is no limit * @param options The options used to read Parquet file + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ chunked_parquet_reader( std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1163,11 +1169,13 @@ class parquet_writer_options_builder { * @endcode * * @param options Settings for 
controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if * requested in parquet_writer_options (empty blob otherwise). */ -std::unique_ptr> write_parquet(parquet_writer_options const& options); +std::unique_ptr> write_parquet( + parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Merges multiple raw metadata blobs that were previously created by write_parquet @@ -1778,8 +1786,10 @@ class parquet_chunked_writer { * @brief Constructor with chunked writer options * * @param[in] options options used to write table + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ - parquet_chunked_writer(chunked_parquet_writer_options const& options); + parquet_chunked_writer(chunked_parquet_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Writes table to output. 
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a9049d5640e..e5489963618 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -488,13 +488,14 @@ using namespace cudf::io::parquet::detail; namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = std::make_unique( - std::move(datasources), options, cudf::get_default_stream(), mr); + auto reader = + std::make_unique(std::move(datasources), options, stream, mr); return reader->read(options); } @@ -554,7 +555,8 @@ table_input_metadata::table_input_metadata(table_metadata const& metadata) /** * @copydoc cudf::io::write_parquet */ -std::unique_ptr> write_parquet(parquet_writer_options const& options) +std::unique_ptr> write_parquet(parquet_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -562,7 +564,7 @@ std::unique_ptr> write_parquet(parquet_writer_options const auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sinks), options, io_detail::single_write_mode::YES, cudf::get_default_stream()); + std::move(sinks), options, io_detail::single_write_mode::YES, stream); writer->write(options.get_table(), options.get_partitions()); @@ -574,13 +576,10 @@ std::unique_ptr> write_parquet(parquet_writer_options const */ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : reader{std::make_unique(chunk_read_limit, - 0, - make_datasources(options.get_source()), - options, - cudf::get_default_stream(), - mr)} + : reader{std::make_unique( + chunk_read_limit, 0, make_datasources(options.get_source()), options, stream, mr)} { } @@ 
-590,12 +589,13 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, std::size_t pass_read_limit, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(chunk_read_limit, pass_read_limit, make_datasources(options.get_source()), options, - cudf::get_default_stream(), + stream, mr)} { } @@ -628,14 +628,15 @@ table_with_metadata chunked_parquet_reader::read_chunk() const /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options) +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sinks), options, io_detail::single_write_mode::NO, cudf::get_default_stream()); + std::move(sinks), options, io_detail::single_write_mode::NO, stream); } /** diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 48bc4ac6fc1..f7b805b68f5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -655,6 +655,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) diff --git 
a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp new file mode 100644 index 00000000000..c6d531bc376 --- /dev/null +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Global environment for temporary files +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class ParquetTest : public cudf::test::BaseFixture {}; + +template +std::vector> make_uniqueptrs_vector(UniqPtrs&&... 
uniqptrs) +{ + std::vector> ptrsvec; + (ptrsvec.push_back(std::forward(uniqptrs)), ...); + return ptrsvec; +} + +cudf::table construct_table() +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6 = [&ones, num_rows] { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + return cudf::test::fixed_width_column_wrapper(col6_data, + col6_data + num_rows); + }(); + cudf::test::fixed_width_column_wrapper col7 = [&ones, num_rows] { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + return cudf::test::fixed_width_column_wrapper(col7_data, + col7_data + num_rows); + }(); + + cudf::test::lists_column_wrapper col8{ + {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}; + + cudf::test::structs_column_wrapper col9 = [&ones] { + cudf::test::fixed_width_column_wrapper child_col(ones.begin(), ones.end()); + return cudf::test::structs_column_wrapper{child_col}; + }(); + + cudf::test::strings_column_wrapper col10 = [] { + std::vector col10_data(num_rows, "rapids"); + return cudf::test::strings_column_wrapper(col10_data.begin(), col10_data.end()); + }(); + + auto colsptr = make_uniqueptrs_vector(col0.release(), + col1.release(), + col2.release(), + col3.release(), + col4.release(), + col5.release(), + 
col6.release(), + col7.release(), + col8.release(), + col9.release(), + col10.release()); + return cudf::table(std::move(colsptr)); +} + +TEST_F(ParquetTest, ParquetWriter) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_parquet(out_opts, cudf::test::get_default_stream()); +} + +TEST_F(ParquetTest, ParquetReader) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_parquet(out_opts, cudf::test::get_default_stream()); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts, cudf::test::get_default_stream()); + auto meta = cudf::io::read_parquet_metadata(cudf::io::source_info{filepath}); +} + +TEST_F(ParquetTest, ChunkedOperations) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet"); + cudf::io::chunked_parquet_writer_options out_opts = + cudf::io::chunked_parquet_writer_options::builder(cudf::io::sink_info{filepath}); + cudf::io::parquet_chunked_writer(out_opts, cudf::test::get_default_stream()).write(tab); + + auto reader = cudf::io::chunked_parquet_reader( + 1L << 31, + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}), + cudf::test::get_default_stream()); + while (reader.has_next()) { + auto chunk = reader.read_chunk(); + } +} From 9937c7f742ee4b453aa26198f4821095db40e671 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:07:49 -1000 Subject: [PATCH 13/23] Remove unnecessary **kwargs in function signatures (#14635) 
Helps makes function signatures stricter to avoid typo inputs being accepted into signatures Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14635 --- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/_lib/string_casting.pyx | 36 +++++------- python/cudf/cudf/core/column/categorical.py | 18 ++++-- python/cudf/cudf/core/column/column.py | 59 +++++++++----------- python/cudf/cudf/core/column/datetime.py | 14 +++-- python/cudf/cudf/core/column/decimal.py | 9 ++- python/cudf/cudf/core/column/interval.py | 4 +- python/cudf/cudf/core/column/lists.py | 6 +- python/cudf/cudf/core/column/numerical.py | 10 ++-- python/cudf/cudf/core/column/string.py | 14 ++--- python/cudf/cudf/core/column/timedelta.py | 10 ++-- python/cudf/cudf/core/dataframe.py | 17 +++--- python/cudf/cudf/core/indexed_frame.py | 4 +- python/cudf/cudf/core/series.py | 2 - python/cudf/cudf/core/single_column_frame.py | 4 +- 15 files changed, 104 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index b3778e45cde..f332fead8d1 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from pandas.core.groupby.groupby import DataError @@ -104,7 +104,7 @@ cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj cdef dict __dict__ - def __cinit__(self, list keys, bool dropna=True, *args, **kwargs): + def __cinit__(self, list keys, bool dropna=True): cdef libcudf_types.null_policy c_null_handling cdef table_view keys_view diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 4b44ac83a70..3826e71f850 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.column cimport Column @@ -95,7 +95,7 @@ def dtos(Column input_col): return floating_to_string(input_col) -def stod(Column input_col, **kwargs): +def stod(Column input_col): """ Converting/Casting input column of type string to double @@ -127,7 +127,7 @@ def ftos(Column input_col): return floating_to_string(input_col) -def stof(Column input_col, **kwargs): +def stof(Column input_col): """ Converting/Casting input column of type string to float @@ -188,7 +188,7 @@ def i8tos(Column input_col): return integer_to_string(input_col) -def stoi8(Column input_col, **kwargs): +def stoi8(Column input_col): """ Converting/Casting input column of type string to int8 @@ -284,7 +284,7 @@ def ltos(Column input_col): return integer_to_string(input_col) -def stol(Column input_col, **kwargs): +def stol(Column input_col): """ Converting/Casting input column of type string to int64 @@ -316,7 +316,7 @@ def ui8tos(Column input_col): return integer_to_string(input_col) -def stoui8(Column input_col, **kwargs): +def stoui8(Column input_col): """ Converting/Casting input column of type string to uint8 @@ -348,7 +348,7 @@ def ui16tos(Column input_col): return integer_to_string(input_col) -def stoui16(Column input_col, **kwargs): +def stoui16(Column input_col): """ Converting/Casting input column of 
type string to uint16 @@ -380,7 +380,7 @@ def uitos(Column input_col): return integer_to_string(input_col) -def stoui(Column input_col, **kwargs): +def stoui(Column input_col): """ Converting/Casting input column of type string to uint32 @@ -412,7 +412,7 @@ def ultos(Column input_col): return integer_to_string(input_col) -def stoul(Column input_col, **kwargs): +def stoul(Column input_col): """ Converting/Casting input column of type string to uint64 @@ -456,7 +456,7 @@ def _to_booleans(Column input_col, object string_true="True"): return Column.from_unique_ptr(move(c_result)) -def to_booleans(Column input_col, **kwargs): +def to_booleans(Column input_col): return _to_booleans(input_col) @@ -631,9 +631,7 @@ def timedelta2int(Column input_col, dtype, format): return Column.from_unique_ptr(move(c_result)) -def int2timedelta( - Column input_col, - **kwargs): +def int2timedelta(Column input_col, str format): """ Converting/Casting input Timedelta column to string column with specified format @@ -649,8 +647,7 @@ def int2timedelta( """ cdef column_view input_column_view = input_col.view() - cdef string c_duration_format = kwargs.get( - 'format', "%D days %H:%M:%S").encode('UTF-8') + cdef string c_duration_format = format.encode('UTF-8') cdef unique_ptr[column] c_result with nogil: c_result = move( @@ -661,7 +658,7 @@ def int2timedelta( return Column.from_unique_ptr(move(c_result)) -def int2ip(Column input_col, **kwargs): +def int2ip(Column input_col): """ Converting/Casting integer column to string column in ipv4 format @@ -684,7 +681,7 @@ def int2ip(Column input_col, **kwargs): return Column.from_unique_ptr(move(c_result)) -def ip2int(Column input_col, **kwargs): +def ip2int(Column input_col): """ Converting string ipv4 column to integer column @@ -732,7 +729,6 @@ def htoi(Column input_col, **kwargs): Parameters ---------- input_col : input column of type string - out_type : The type of integer column expected Returns ------- @@ -742,9 +738,7 @@ def htoi(Column 
input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - kwargs.get('dtype', cudf.dtype("int64")) - ] + SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] ) ) cdef data_type c_out_type = data_type(tid) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 59fd4631067..71143fa7a95 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1310,22 +1310,28 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: new_categories=dtype.categories, ordered=bool(dtype.ordered) ) - def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, format=None, **kwargs) -> StringColumn: + def as_string_column( + self, dtype, format: str | None = None + ) -> StringColumn: return self._get_decategorized_column().as_string_column( dtype, format=format ) - def as_datetime_column(self, dtype, **kwargs) -> DatetimeColumn: + def as_datetime_column( + self, dtype, format: str | None = None + ) -> DatetimeColumn: return self._get_decategorized_column().as_datetime_column( - dtype, **kwargs + dtype, format ) - def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: + def as_timedelta_column( + self, dtype, format: str | None = None + ) -> TimeDeltaColumn: return self._get_decategorized_column().as_timedelta_column( - dtype, **kwargs + dtype, format ) def _get_decategorized_column(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 19e76d4a95b..81579b53bb7 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -978,11 +978,17 @@ def distinct_count(self, dropna: bool = True) -> int: def 
can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() - def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: + def astype( + self, dtype: Dtype, copy: bool = False, format: str | None = None + ) -> ColumnBase: + if copy: + col = self.copy() + else: + col = self if self.dtype == dtype: - return self + return col if is_categorical_dtype(dtype): - return self.as_categorical_column(dtype) + return col.as_categorical_column(dtype) if ( isinstance(dtype, str) @@ -999,9 +1005,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: else: dtype = pandas_dtypes_to_np_dtypes.get(dtype, dtype) if _is_non_decimal_numeric_dtype(dtype): - return self.as_numerical_column(dtype, **kwargs) + return col.as_numerical_column(dtype) elif is_categorical_dtype(dtype): - return self.as_categorical_column(dtype) + return col.as_categorical_column(dtype) elif cudf.dtype(dtype).type in { np.str_, np.object_, @@ -1014,23 +1020,23 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: f"Casting to {dtype} is not supported, use " "`.astype('str')` instead." 
) - return self.as_string_column(dtype, **kwargs) + return col.as_string_column(dtype, format=format) elif isinstance(dtype, (ListDtype, StructDtype)): - if not self.dtype == dtype: + if not col.dtype == dtype: raise NotImplementedError( f"Casting {self.dtype} columns not currently supported" ) - return self + return col elif isinstance(dtype, IntervalDtype): - return self.as_interval_column(dtype, **kwargs) + return col.as_interval_column(dtype) elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - return self.as_decimal_column(dtype, **kwargs) + return col.as_decimal_column(dtype) elif np.issubdtype(cast(Any, dtype), np.datetime64): - return self.as_datetime_column(dtype, **kwargs) + return col.as_datetime_column(dtype, format=format) elif np.issubdtype(cast(Any, dtype), np.timedelta64): - return self.as_timedelta_column(dtype, **kwargs) + return col.as_timedelta_column(dtype, format=format) else: - return self.as_numerical_column(dtype, **kwargs) + return col.as_numerical_column(dtype) def as_categorical_column(self, dtype) -> ColumnBase: if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): @@ -1076,50 +1082,35 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError def as_interval_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.IntervalColumn": raise NotImplementedError def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": raise NotImplementedError def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": raise 
NotImplementedError def as_decimal_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: raise NotImplementedError - def as_decimal128_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal128Column": - raise NotImplementedError - - def as_decimal64_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal64Column": - raise NotImplementedError - - def as_decimal32_column( - self, dtype: Dtype, **kwargs - ) -> "cudf.core.column.Decimal32Column": - raise NotImplementedError - def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) if not is_bool_dtype(mask.dtype): diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2b44b46bb9e..2ab2dd46c53 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -422,21 +422,23 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) return output - def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: + def as_datetime_column( + self, dtype: Dtype, format: str | None = None + ) -> DatetimeColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": raise TypeError( f"cannot astype a datetimelike from {self.dtype} to {dtype}" ) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, @@ -448,7 +450,7 @@ def as_numerical_column( return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( 
@@ -725,9 +727,9 @@ def _local_time(self): return utc_to_local(self, str(self.dtype.tz)) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": - return self._local_time.as_string_column(dtype, format, **kwargs) + return self._local_time.as_string_column(dtype, format) def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 299875f0091..0e90b522f2c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,5 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations + import warnings from decimal import Decimal from typing import Any, Optional, Sequence, Union, cast @@ -37,7 +39,8 @@ class DecimalBaseColumn(NumericalBaseColumn): _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS def as_decimal_column( - self, dtype: Dtype, **kwargs + self, + dtype: Dtype, ) -> Union["DecimalBaseColumn"]: if ( isinstance(dtype, cudf.core.dtypes.DecimalDtype) @@ -53,7 +56,7 @@ def as_decimal_column( return libcudf.unary.cast(self, dtype) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: return cpp_from_decimal(self) @@ -201,7 +204,7 @@ def _decimal_quantile( return result._with_type_metadata(self.dtype) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": return libcudf.unary.cast(self, dtype) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index eed7bba3628..81059717b20 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA 
CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from typing import Optional import pandas as pd @@ -99,7 +99,7 @@ def copy(self, deep=True): closed=closed, ) - def as_interval_column(self, dtype, **kwargs): + def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): if isinstance(self.dtype, CategoricalDtype): new_struct = self._get_decategorized_column() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a5653e66513..0cccec6f28a 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from __future__ import annotations from functools import cached_property from typing import List, Optional, Sequence, Tuple, Union @@ -243,7 +245,7 @@ def from_sequences( return res def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": """ Create a strings column from a list column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 148fa252fad..5461d1b13b5 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -340,7 +340,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn": return libcudf.string_casting.int2ip(self) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ @@ -353,7 +353,7 @@ def as_string_column( ) def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": return cast( "cudf.core.column.DatetimeColumn", @@ -367,7 +367,7 @@ def as_datetime_column( ) def as_timedelta_column( - self, dtype: Dtype, 
**kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": return cast( "cudf.core.column.TimeDeltaColumn", @@ -381,11 +381,11 @@ def as_timedelta_column( ) def as_decimal_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libcudf.unary.cast(self, dtype) - def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06b5ac31ca6..84333fc205a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5633,7 +5633,7 @@ def __contains__(self, item: ScalarLike) -> bool: ) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": out_dtype = cudf.api.types.dtype(dtype) string_col = self @@ -5696,14 +5696,13 @@ def _as_datetime_or_timedelta_column(self, dtype, format): return result_col def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": out_dtype = cudf.api.types.dtype(dtype) # infer on host from the first not na element # or return all null column if all values # are null in current column - format = kwargs.get("format", None) if format is None: if self.null_count == len(self): return cast( @@ -5720,19 +5719,20 @@ def as_datetime_column( return self._as_datetime_or_timedelta_column(out_dtype, format) def as_timedelta_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.TimeDeltaColumn": out_dtype = cudf.api.types.dtype(dtype) - format = "%D days %H:%M:%S" + if format is None: + format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) def as_decimal_column( - 
self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> StringColumn: return self diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 2f842130f48..6038a1a1e97 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -288,7 +288,7 @@ def fillna( return super().fillna(fill_value, method) def as_numerical_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype ) -> "cudf.core.column.NumericalColumn": col = column.build_column( data=self.base_data, @@ -300,14 +300,14 @@ def as_numerical_column( return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) def as_datetime_column( - self, dtype: Dtype, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.DatetimeColumn": raise TypeError( f"cannot astype a timedelta from {self.dtype} to {dtype}" ) def as_string_column( - self, dtype: Dtype, format=None, **kwargs + self, dtype: Dtype, format: str | None = None ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( @@ -323,7 +323,9 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: + def as_timedelta_column( + self, dtype: Dtype, format: str | None = None + ) -> TimeDeltaColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 813ecc32069..51b661593fc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -24,6 +24,7 @@ Set, Tuple, Union, + cast, ) import cupy @@ -1986,8 +1987,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool 
= False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -2338,7 +2337,7 @@ def to_dict( @_cudf_nvtx_annotate def scatter_by_map( - self, map_index, map_size=None, keep_index=True, **kwargs + self, map_index, map_size=None, keep_index=True, debug: bool = False ): """Scatter to a list of dataframes. @@ -2379,7 +2378,11 @@ def scatter_by_map( # Convert string or categorical to integer if isinstance(map_index, cudf.core.column.StringColumn): - map_index = map_index.as_categorical_column("category").codes + cat_index = cast( + cudf.core.column.CategoricalColumn, + map_index.as_categorical_column("category"), + ) + map_index = cat_index.codes warnings.warn( "Using StringColumn for map_index in scatter_by_map. " "Use an integer array/column for better performance." @@ -2391,7 +2394,7 @@ def scatter_by_map( "Use an integer array/column for better performance." ) - if kwargs.get("debug", False) == 1 and map_size is not None: + if debug and map_size is not None: count = map_index.distinct_count() if map_size < count: raise ValueError( @@ -2406,7 +2409,7 @@ def scatter_by_map( partitioned = self._from_columns_like_self( partitioned_columns, column_names=self._column_names, - index_names=self._index_names if keep_index else None, + index_names=list(self._index_names) if keep_index else None, ) # due to the split limitation mentioned @@ -2537,7 +2540,7 @@ def items(self): yield (k, self[k]) @_cudf_nvtx_annotate - def equals(self, other, **kwargs): + def equals(self, other): ret = super().equals(other) # If all other checks matched, validate names. if ret: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ab089ceb103..5955e21fea0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
"""Base class for Frame types that have an index.""" from __future__ import annotations @@ -3612,8 +3612,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8739a61dd8b..df5a62b384e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1578,8 +1578,6 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - *args, - **kwargs, ) -> Tuple[ Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 911e7ac905c..b73f756d7dc 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -310,8 +310,6 @@ def _make_operands_for_binop( other: Any, fill_value: Any = None, reflect: bool = False, - *args, - **kwargs, ) -> Union[ Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], NotImplementedType, From 9e6400b7d1f9d525b7f45e7b56874dc830c02d1f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 11 Jan 2024 18:38:07 -0600 Subject: [PATCH 14/23] Describe unpickling expectations when cudf.pandas is enabled (#14693) Adds to the docs the unpickling expectations that were noted in #14692. 
Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14693 --- docs/cudf/source/cudf_pandas/faq.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index bf9c2b98c2d..bbeaf0a5f00 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -113,6 +113,9 @@ There are a few known limitations that you should be aware of: pandas - `cudf.pandas` isn't compatible with directly using `import cudf` and is intended to be used with pandas-based workflows. +- Unpickling objects that were pickled with "regular" pandas will not + work: you must have pickled an object with `cudf.pandas` enabled for + it to be unpickled when `cudf.pandas` is enabled. - Global variables can be accessed but can't be modified during CPU-fallback ```python From 3c55a6e82e1a53581e1efd1b29d9bc0802bb054b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 11 Jan 2024 16:57:26 -0800 Subject: [PATCH 15/23] Fix CMake args (#14746) This was an oversight in #13531 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14746 --- ci/build_wheel_cudf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index e79b9a35aa2..cde22bb70d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
set -euo pipefail package_dir="python/cudf" -export SKBUILD_CONFIGURE_OPTIONS="-DUSE_LIBARROW_FROM_PYARROW=ON" +export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" ./ci/build_wheel.sh cudf ${package_dir} From 2003ea2e8d2b03fb1b3a3c2f2046893395328fd2 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Thu, 11 Jan 2024 20:16:21 -0500 Subject: [PATCH 16/23] Remove usages of rapids-env-update (#14748) Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14748 --- ci/build_cpp.sh | 8 ++++++-- ci/build_python.sh | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index f1ad8ee7778..740a6409ccd 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_python.sh b/ci/build_python.sh index 32fe7b6b3ce..3c2a7761e1a 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja From c0a3cd14eabd18ba8cedd3b7dd87cba8b6706719 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:13:59 -1000 Subject: [PATCH 17/23] Clean up base column methods (#14725) * Removed the need for a `drop_nan` argument in `Column.dropna` * Removed the need for `Column.as_frame` * Removed the need for `Column.force_deep_copy` Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14725 --- .../cudf/benchmarks/internal/bench_column.py | 7 ++--- python/cudf/cudf/core/column/categorical.py | 11 ++++---- python/cudf/cudf/core/column/column.py | 28 ++++--------------- python/cudf/cudf/core/column/interval.py | 5 +--- python/cudf/cudf/core/column/numerical.py | 5 ---- python/cudf/cudf/io/dlpack.py | 4 +-- 6 files changed, 18 insertions(+), 42 deletions(-) diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index d4969b39f7f..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
"""Benchmarks of Column methods.""" @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -@pytest.mark.parametrize("dropnan", [True, False]) -def bench_dropna(benchmark, column, dropnan): - benchmark(column.dropna, drop_nan=dropnan) +def bench_dropna(benchmark, column): + benchmark(column.dropna) @benchmark_with_object(cls="column", dtype="float") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71143fa7a95..eb4220c5895 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -987,15 +987,16 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if isinstance(col.categories.dtype, IntervalDtype): + cats = col.categories + if cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] + elif not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. 
# TODO: work on interval index dropna - categories = col.categories.to_pandas() - else: - categories = col.categories.dropna(drop_nan=True).to_pandas() + cats = cats.dropna() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=col.ordered + codes, categories=cats.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81579b53bb7..3cf686da7b0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -109,16 +109,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } - def as_frame(self) -> "cudf.core.frame.Frame": - """ - Converts a Column to Frame - """ - return cudf.core.single_column_frame.SingleColumnFrame( - {None: self.copy(deep=False)} - ) - def data_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object @@ -155,7 +147,7 @@ def data_array_view( return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array @@ -291,8 +283,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self, drop_nan: bool = False) -> ColumnBase: - # The drop_nan argument is only used for numerical columns. + def dropna(self) -> ColumnBase: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -437,14 +428,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self) -> Self: - """ - A method to create deep copy irrespective of whether - `copy-on-write` is enabled. 
- """ - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -464,7 +447,8 @@ def copy(self, deep: bool = True) -> Self: them. """ if deep: - return self.force_deep_copy() + result = libcudf.copying.copy_column(self) + return result._with_type_metadata(self.dtype) else: return cast( Self, @@ -1069,7 +1053,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # columns include null index in factorization; remove: if self.has_nulls(): - cats = cats.dropna(drop_nan=False) + cats = cats.dropna() min_type = min_unsigned_type(len(cats), 8) if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 81059717b20..6a7e7729123 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -142,7 +142,4 @@ def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self._closed) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5461d1b13b5..0577e0f37ed 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.stream_compaction import drop_nulls from cudf._lib.types import size_type_dtype from cudf._typing import ( ColumnBinaryOperand, @@ -421,10 +420,6 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count - def dropna(self, drop_nan: bool = False) -> NumericalColumn: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] - def 
_process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e1950c9f250..bed376e4a79 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf_obj.as_frame() + gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " From 7a42b8b57923b9515391cfe2c4668380b15ed118 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:14:30 -1000 Subject: [PATCH 18/23] Use as_column instead of arange for range like inputs (#14689) 1. Allows range-like inputs in `as_column` to short circuit and not materialize when creating columns 2. 
Avoids diverging column construction logic between `column.arange` and `column.as_column` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14689 --- python/cudf/cudf/core/column/__init__.py | 3 +- python/cudf/cudf/core/column/categorical.py | 12 ++- python/cudf/cudf/core/column/column.py | 99 +++++---------------- python/cudf/cudf/core/dataframe.py | 10 ++- python/cudf/cudf/core/groupby/groupby.py | 10 ++- python/cudf/cudf/core/index.py | 4 +- python/cudf/cudf/core/indexed_frame.py | 18 ++-- python/cudf/cudf/core/join/join.py | 8 +- python/cudf/cudf/core/multiindex.py | 16 ++-- python/cudf/cudf/core/series.py | 9 +- python/cudf/cudf/core/window/rolling.py | 6 +- python/cudf/cudf/tests/test_column.py | 10 +-- 12 files changed, 79 insertions(+), 126 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index aba4ded4f9d..3dddcae85dc 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
""" isort: skip_file @@ -8,7 +8,6 @@ from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, - arange, as_column, build_categorical_column, build_column, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index eb4220c5895..f52621dc444 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1159,7 +1159,7 @@ def find_and_replace( new_cats_col = new_cats_col.apply_boolean_mask(bmask) new_cats = cudf.DataFrame._from_data( { - "index": cudf.core.column.arange(len(new_cats_col)), + "index": column.as_column(range(len(new_cats_col))), "cats": new_cats_col, } ) @@ -1531,9 +1531,13 @@ def _set_categories( ) out_code_dtype = min_unsigned_type(max_cat_size) - cur_order = column.arange(len(cur_codes)) - old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) - new_codes = column.arange(len(new_cats), dtype=out_code_dtype) + cur_order = column.as_column(range(len(cur_codes))) + old_codes = column.as_column( + range(len(cur_cats)), dtype=out_code_dtype + ) + new_codes = column.as_column( + range(len(new_cats)), dtype=out_code_dtype + ) new_df = cudf.DataFrame._from_data( data={"new_codes": new_codes, "cats": new_cats} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3cf686da7b0..c13ec33c51c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -554,10 +554,8 @@ def slice( ]._with_type_metadata(self.dtype) else: # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, + gather_map = as_column( + range(start, stop, stride), dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -626,10 +624,8 @@ def _scatter_by_slice( ) # step != 1, create a scatter map with arange - scatter_map = arange( - start=start, - stop=stop, - step=step, + scatter_map = 
as_column( + range(start, stop, step), dtype=cudf.dtype(np.int32), ) @@ -745,7 +741,7 @@ def indices_of( assert len(value) == 1 mask = libcudf.search.contains(value, self) return apply_boolean_mask( - [arange(0, len(self), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: @@ -1379,7 +1375,9 @@ def _return_sentinel_column(): [self], [cats], how="left" ) codes = libcudf.copying.gather( - [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + [as_column(range(len(cats)), dtype=dtype)], + right_gather_map, + nullify=True, ) del right_gather_map # reorder `codes` so that its values correspond to the @@ -1905,13 +1903,26 @@ def as_column( * Objects exposing ``__array_interface__``(e.g., numpy arrays) * pyarrow array * pandas.Categorical objects + * range objects """ - if isinstance(arbitrary, ColumnBase): + if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): + column = libcudf.filling.sequence( + len(arbitrary), + as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), + as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), + ) + if cudf.get_option("default_integer_bitwidth") and dtype is None: + dtype = cudf.dtype( + f'i{cudf.get_option("default_integer_bitwidth")//8}' + ) + if dtype is not None: + column = column.astype(dtype) + return column + elif isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) else: return arbitrary - elif isinstance(arbitrary, cudf.Series): data = arbitrary._column if dtype is not None: @@ -2614,70 +2625,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: return columns -def arange( - start: Union[int, float], - stop: Optional[Union[int, float]] = None, - step: Union[int, float] = 1, - dtype=None, -) -> cudf.core.column.NumericalColumn: - """ - Returns a column with evenly spaced values within a given interval. 
- - Values are generated within the half-open interval [start, stop). - The first three arguments are mapped like the range built-in function, - i.e. start and step are optional. - - Parameters - ---------- - start : int/float - Start of the interval. - stop : int/float, default is None - Stop of the interval. - step : int/float, default 1 - Step width between each pair of consecutive values. - dtype : default None - Data type specifier. It is inferred from other arguments by default. - - Returns - ------- - cudf.core.column.NumericalColumn - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.arange(2, 7, 1, dtype='int16') - >>> col - - >>> cudf.Series(col) - 0 2 - 1 3 - 2 4 - 3 5 - 4 6 - dtype: int16 - """ - if stop is None: - stop = start - start = 0 - - if step is None: - step = 1 - - size = len(range(int(start), int(stop), int(step))) - if size == 0: - if dtype is None: - dtype = cudf.dtype("int64") - return cast( - cudf.core.column.NumericalColumn, column_empty(0, dtype=dtype) - ) - - return libcudf.filling.sequence( - size, - as_device_scalar(start, dtype=dtype), - as_device_scalar(step, dtype=dtype), - ) - - def full( size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None ) -> ColumnBase: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 51b661593fc..f9cf180ff44 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -342,10 +342,16 @@ def _getitem_tuple_arg(self, arg): tmp_col_name = (tmp_col_name, *extra) cantor_name = (cantor_name, *extra) other_df = DataFrame( - {tmp_col_name: column.arange(len(tmp_arg[0]))}, + { + tmp_col_name: column.as_column( + range(len(tmp_arg[0])) + ) + }, index=as_index(tmp_arg[0]), ) - columns_df[cantor_name] = column.arange(len(columns_df)) + columns_df[cantor_name] = column.as_column( + range(len(columns_df)) + ) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, # update it over here 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 73e6774f5ce..fbd85fd9876 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import itertools @@ -23,7 +23,7 @@ from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, arange, as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable @@ -761,7 +761,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # subsample the gather map from the full input ordering, # rather than permuting the gather map of the output. _, (ordering,), _ = self._groupby.groups( - [arange(0, len(self.obj))] + [as_column(range(0, len(self.obj)))] ) # Invert permutation from original order to groups on the # subset of entries we want. @@ -2543,7 +2543,9 @@ def _mimic_pandas_order( # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. 
- _, (ordering,), _ = self._groupby.groups([arange(0, len(self.obj))]) + _, (ordering,), _ = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) if self._dropna and any( c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5c33cd09ad1..e012d8e7140 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -286,9 +286,7 @@ def _num_rows(self): @_cudf_nvtx_annotate def _values(self): if len(self) > 0: - return column.arange( - self._start, self._stop, self._step, dtype=self.dtype - ) + return column.as_column(self._range, dtype=self.dtype) else: return column.column_empty(0, masked=False, dtype=self.dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5955e21fea0..2a35ac0f959 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,12 +182,8 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame( - {"__": cudf.core.column.arange(len(labels))}, index=labels - ) - rhs = cudf.DataFrame( - {"_": cudf.core.column.arange(len(obj))}, index=obj.index - ) + lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -1897,10 +1893,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: if stride != 1: return self._gather( GatherMap.from_column_unchecked( - cudf.core.column.arange( - start, - stop=stop, - step=stride, + as_column( + range(start, stop, stride), dtype=libcudf.types.size_type_dtype, ), len(self), @@ -2541,9 +2535,9 @@ def _align_to_index( # to recover ordering after index alignment. 
sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) + lhs[sort_col_id] = as_column(range(len(lhs))) elif how == "right": - rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) + rhs[sort_col_id] = as_column(range(len(rhs))) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 20f5b7989eb..86f0c8465ba 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -232,7 +232,11 @@ def _gather_maps(self, left_cols, right_cols): key_order = list( itertools.chain.from_iterable( libcudf.copying.gather( - [cudf.core.column.arange(n, dtype=size_type_dtype)], + [ + cudf.core.column.as_column( + range(n), dtype=size_type_dtype + ) + ], map_, nullify=null, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 489f0e74dd6..0f323dd5540 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -501,9 +501,9 @@ def __repr__(self): # TODO: Update the following two arange calls to # a single arange call once arange has support for # a vector start/end points. 
- indices = column.arange(start=0, stop=n, step=1) + indices = column.as_column(range(n)) indices = indices.append( - column.arange(start=len(self) - n, stop=len(self), step=1) + column.as_column(range(len(self) - n, len(self), 1)) ) preprocess = self.take(indices) else: @@ -795,7 +795,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - {"idx": cudf.Series(column.arange(len(frame)))} + {"idx": cudf.Series(column.as_column(range(len(frame))))} ), ], axis=1, @@ -807,7 +807,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): lookup_order = "_" + "_".join(map(str, lookup._data.names)) - lookup[lookup_order] = column.arange(len(lookup)) + lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] ) @@ -840,14 +840,16 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): ): stop = row_tuple.stop or max_length start, stop, step = row_tuple.indices(stop) - return column.arange(start, stop, step) + return column.as_column(range(start, stop, step)) start_values = self._compute_validity_mask( index, row_tuple.start, max_length ) stop_values = self._compute_validity_mask( index, row_tuple.stop, max_length ) - return column.arange(start_values.min(), stop_values.max() + 1) + return column.as_column( + range(start_values.min(), stop_values.max() + 1) + ) elif isinstance(row_tuple, numbers.Number): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) @@ -1024,7 +1026,7 @@ def __getitem__(self, index): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.arange(start, stop, step) + index = column.as_column(range(start, stop, step)) result = MultiIndex.from_frame( self.to_frame(index=False, name=range(0, self.nlevels)).take( index diff --git 
a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index df5a62b384e..bc1eaef86db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -55,7 +55,6 @@ DatetimeColumn, IntervalColumn, TimeDeltaColumn, - arange, as_column, full, ) @@ -1366,7 +1365,9 @@ def map(self, arg, na_action=None) -> "Series": raise NotImplementedError( "default values in dicts are currently not supported." ) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1386,7 +1387,9 @@ def map(self, arg, na_action=None) -> "Series": "Reindexing only valid with" " uniquely valued Index objects" ) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8a92ea86d57..207fb469990 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION +# Copyright (c) 2020-2024, NVIDIA CORPORATION import itertools @@ -235,7 +235,7 @@ def _apply_agg_column(self, source_column, agg_name): start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") - idx = cudf.core.column.arange(len(start)) + idx = as_column(range(len(start))) preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( "int32" ) @@ -531,7 +531,7 @@ def __init__(self, groupby, window, min_periods=None, center=False): def _window_to_window_sizes(self, window): if is_integer(window): return cudautils.grouped_window_sizes_from_offset( - column.arange(len(self.obj)).data_array_view(mode="read"), + as_column(range(len(self.obj))).data_array_view(mode="read"), self._group_starts, window, ) diff --git 
a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index a4b27ae19ac..3d21994a8d5 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools -from cudf.core.column.column import arange, as_column +from cudf.core.column.column import as_column from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils @@ -552,9 +552,3 @@ def test_astype_with_aliases(alias, expect_dtype, data): gd_data = cudf.Series.from_pandas(pd_data) assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) - - -def test_arange_empty(): - result = arange(0) - assert len(result) == 0 - assert result.dtype == np.dtype(np.int64) From 27b106f832999afa5b3353aaa2adcdb695fb4a47 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 11 Jan 2024 18:32:19 -0800 Subject: [PATCH 19/23] [Java] Choose The Correct RoundingMode For Checking Decimal OutOfBounds (#14731) This PR fixes an error in the `outOfBounds` method in which the `RoundingMode` was selected based on positive values only. 
The RHS should be rounded towards positive infinity (ROUND_CEILING) for the lower bound and towards negative infinity (ROUND_FLOOR) for the upper bound closes #14732 Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14731 --- .../java/ai/rapids/cudf/DecimalUtils.java | 30 +++++++------- .../java/ai/rapids/cudf/DecimalUtilsTest.java | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java diff --git a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java index 1979bd1bd5b..7a5be9b08b9 100644 --- a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java +++ b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,13 +82,13 @@ public static ColumnVector lessThan(ColumnView lhs, BigDecimal rhs) { int leftScale = lhs.getType().getScale(); int leftPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // less than and it is rhs that we are rounding, we will round away from 0 (UP) - // to make sure we always return the correct value. - // For example: - // 100.1 < 100.19 - // If we rounded down the rhs 100.19 would become 100.1, and now 100.1 is not < 100.1 - BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_UP); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to positive infinity to maintain + // the relation. 
Ex: + // 10.2 < 10.29 = true, after rounding rhs to ceiling ===> 10.2 < 10.3 = true, relation is maintained + // 10.3 < 10.29 = false, after rounding rhs to ceiling ===> 10.3 < 10.3 = false, relation is maintained + // 10.1 < 10.10 = false, after rounding rhs to ceiling ===> 10.1 < 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_CEILING); if (roundedRhs.precision() > leftPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but @@ -136,13 +136,13 @@ public static ColumnVector greaterThan(ColumnView lhs, BigDecimal rhs) { int cvScale = lhs.getType().getScale(); int maxPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // greater than and it is rhs that we are rounding, we will round towards 0 (DOWN) - // to make sure we always return the correct value. - // For example: - // 100.2 > 100.19 - // If we rounded up the rhs 100.19 would become 100.2, and now 100.2 is not > 100.2 - BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_DOWN); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to negative infinity to maintain + // the relation. 
Ex: + // 10.3 > 10.29 = true, after rounding rhs to floor ===> 10.3 > 10.2 = true, relation is maintained + // 10.2 > 10.29 = false, after rounding rhs to floor ===> 10.2 > 10.2 = false, relation is maintained + // 10.1 > 10.10 = false, after rounding rhs to floor ===> 10.1 > 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_FLOOR); if (roundedRhs.precision() > maxPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but diff --git a/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java new file mode 100644 index 00000000000..a96eeda5dd7 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java @@ -0,0 +1,40 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; + +public class DecimalUtilsTest extends CudfTestBase { + @Test + public void testOutOfBounds() { + try (ColumnView cv = ColumnVector.fromDecimals( + new BigDecimal("-1E+3"), + new BigDecimal("1E+3"), + new BigDecimal("9E+1"), + new BigDecimal("-9E+1"), + new BigDecimal("-91")); + ColumnView expected = ColumnVector.fromBooleans(true, true, false, false, true); + ColumnView result = DecimalUtils.outOfBounds(cv, 1, -1)) { + assertColumnsAreEqual(expected, result); + } + } +} From 5c78b7ea6b75f503d5df4abc828d80a0b470a284 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Jan 2024 08:49:20 +0000 Subject: [PATCH 20/23] Fix logic bug introduced in #14730 (#14742) The removal of `valid_count` on columns in #14730 had one logic bug, fixed here. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14742 --- python/cudf/cudf/core/column/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f52621dc444..6b3ee0ba852 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1380,7 +1380,7 @@ def _concat( # Find the first non-null column: head = next( - (obj for obj in objs if not obj.null_count != len(obj)), objs[0] + (obj for obj in objs if obj.null_count != len(obj)), objs[0] ) # Combine and de-dupe the categories From 7ca988f207730a3ae936e90d0104c4e6a14749ff Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 12 Jan 2024 12:22:58 -0600 Subject: [PATCH 21/23] Fix ``Groupby.get_group`` (#14728) Closes https://github.com/rapidsai/cudf/issues/14727 Authors: - 
Richard (Rick) Zamora (https://github.com/rjzamora) - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/14728 --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/tests/groupby/test_indexing.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index fbd85fd9876..4e8947652ff 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -377,7 +377,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self.obj - return obj.loc[self.groups[name]] + return obj.loc[self.groups[name].drop_duplicates()] @_cudf_nvtx_annotate def size(self): diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py index 06777c8e6af..57e8bc1c2d8 100644 --- a/python/cudf/cudf/tests/groupby/test_indexing.py +++ b/python/cudf/cudf/tests/groupby/test_indexing.py @@ -1 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import cudf +from cudf.testing._utils import assert_eq + + +def test_rank_return_type_compatible_mode(): + # in compatible mode, rank() always returns floats + df = cudf.DataFrame({"a": range(10), "b": [0] * 10}, index=[0] * 10) + pdf = df.to_pandas() + expect = pdf.groupby("b").get_group(0) + result = df.groupby("b").get_group(0) + assert_eq(expect, result) From 07103355fea0fb3fd0e1115019bbac7d65bb132f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 15 Jan 2024 09:33:11 +0100 Subject: [PATCH 22/23] Refactoring of Buffers (last step towards unifying COW and Spilling) (#13801) This PR de-couples buffer slices/views from owning buffers. 
As it is now, all buffer classes (`ExposureTrackedBuffer`, `BufferSlice`, `SpillableBuffer`, `SpillableBufferSlice`) inherent from `Buffer`, however they are not Liskov substitutable as pointed by @wence- and @vyasr ([here](https://github.com/rapidsai/cudf/pull/13307#discussion_r1189748484) and [here](https://github.com/rapidsai/cudf/pull/13307#discussion_r1239014158)). To fix this, we now have a `Buffer` and a `BufferOwner` class. We still use the `Buffer` throughout cuDF but it now points to an `BufferOwner`. We have the following class hierarchy: ``` ExposureTrackedBufferOwner -> BufferOwner SpillableBufferOwner -> BufferOwner ExposureTrackedBuffer -> Buffer SpillableBuffer -> Buffer ``` With the following relationship: ``` Buffer -> BufferOwner ExposureTrackedBuffer -> ExposureTrackedBufferOwner SpillableBuffer -> SpillableBufferOwner ``` #### Unify COW and Spilling In a follow-up PR, the spilling buffer classes will inherent from the exposure tracked buffer classes so we get the following hierarchy: ``` SpillableBufferOwner -> ExposureTrackedBufferOwner -> BufferOwner SpillableBuffer -> ExposureTrackedBuffer -> Buffer ``` Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/13801 --- .../source/developer_guide/library_design.md | 14 +- python/cudf/cudf/core/abc.py | 10 +- python/cudf/cudf/core/buffer/__init__.py | 8 +- python/cudf/cudf/core/buffer/buffer.py | 285 ++++++++++----- .../core/buffer/exposure_tracked_buffer.py | 261 ++------------ python/cudf/cudf/core/buffer/spill_manager.py | 14 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 325 +++++------------- python/cudf/cudf/core/buffer/utils.py | 105 +++++- python/cudf/cudf/tests/test_buffer.py | 13 +- python/cudf/cudf/tests/test_copying.py | 15 +- python/cudf/cudf/tests/test_spilling.py | 31 +- 11 files changed, 483 insertions(+), 598 deletions(-) diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 016c2c1d281..0b37de00f6b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,26 +325,26 @@ This section describes the internal implementation details of the copy-on-write It is recommended that developers familiarize themselves with [the user-facing documentation](copy-on-write-user-doc) of this functionality before reading through the internals below. -The core copy-on-write implementation relies on the factory function `as_exposure_tracked_buffer` and the two classes `ExposureTrackedBuffer` and `BufferSlice`. +The core copy-on-write implementation relies on `ExposureTrackedBuffer` and the tracking features of `BufferOwner`. -An `ExposureTrackedBuffer` is a subclass of the regular `Buffer` that tracks internal and external references to its underlying memory. Internal references are tracked by maintaining [weak references](https://docs.python.org/3/library/weakref.html) to every `BufferSlice` of the underlying memory. 
External references are tracked through "exposure" status of the underlying memory. A buffer is considered exposed if the device pointer (integer or void*) has been handed out to a library outside of cudf. In this case, we have no way of knowing if the data are being modified by a third party. +`BufferOwner` tracks internal and external references to its underlying memory. Internal references are tracked by maintaining [weak references](https://docs.python.org/3/library/weakref.html) to every `ExposureTrackedBuffer` of the underlying memory. External references are tracked through "exposure" status of the underlying memory. A buffer is considered exposed if the device pointer (integer or void*) has been handed out to a library outside of cudf. In this case, we have no way of knowing if the data are being modified by a third party. -`BufferSlice` is a subclass of `ExposureTrackedBuffer` that represents a _slice_ of the memory underlying a exposure tracked buffer. +`ExposureTrackedBuffer` is a subclass of `Buffer` that represents a _slice_ of the memory underlying an exposure tracked buffer. -When the cudf option `"copy_on_write"` is `True`, `as_buffer` calls `as_exposure_tracked_buffer`, which always returns a `BufferSlice`. It is then the slices that determine whether or not to make a copy when a write operation is performed on a `Column` (see below). If multiple slices point to the same underlying memory, then a copy must be made whenever a modification is attempted. +When the cudf option `"copy_on_write"` is `True`, `as_buffer` returns a `ExposureTrackedBuffer`. It is this class that determines whether or not to make a copy when a write operation is performed on a `Column` (see below). If multiple slices point to the same underlying memory, then a copy must be made whenever a modification is attempted. 
### Eager copies when exposing to third-party libraries -If a `Column`/`BufferSlice` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever +If a `Column`/`ExposureTrackedBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling -`.make_single_owner_inplace` which ensures a true copy of underlying data is made and that the slice is the sole owner. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`BufferSlice` as exposed thus indicating that any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy. +`.make_single_owner_inplace` which ensures a true copy of underlying data is made and that the slice is the sole owner. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`ExposureTrackedBuffer` as exposed thus indicating that any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy. ### Obtaining a read-only object A read-only object can be quite useful for operations that will not mutate the data. This can be achieved by calling `.get_ptr(mode="read")`, and using `cuda_array_interface_wrapper` to wrap a `__cuda_array_interface__` object around it. -This will not trigger a deep copy even if multiple `BufferSlice` points to the same `ExposureTrackedBuffer`. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. 
Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host +This will not trigger a deep copy even if multiple `ExposureTrackedBuffer`s point to the same `ExposureTrackedBufferOwner`. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index adf9fe39e4f..ce6bb83bc77 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" import pickle @@ -89,7 +89,13 @@ def device_serialize(self): """ header, frames = self.serialize() assert all( - isinstance(f, (cudf.core.buffer.Buffer, memoryview)) + isinstance( + f, + ( + cudf.core.buffer.Buffer, + memoryview, + ), + ) for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py index d8883bd97e5..9b9774c12be 100644 --- a/python/cudf/cudf/core/buffer/__init__.py +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -1,6 +1,10 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.buffer import ( + Buffer, + BufferOwner, + cuda_array_interface_wrapper, +) from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock from cudf.core.buffer.utils import ( diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 59d20a2784d..8d278c9c065 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations import math import pickle +import weakref from types import SimpleNamespace from typing import Any, Dict, Literal, Mapping, Optional, Sequence, Tuple @@ -90,15 +91,31 @@ def cuda_array_interface_wrapper( ) -class Buffer(Serializable): - """A Buffer represents device memory. +class BufferOwner(Serializable): + """An owning buffer that represents device memory. - Use the factory function `as_buffer` to create a Buffer instance. + This class isn't meant to be used throughout cuDF. Instead, it + standardizes data owning by wrapping any data object that + represents device memory. Multiple `Buffer` instances, which are + the ones used throughout cuDF, can then refer to the same + `BufferOwner` instance. + + In order to implement copy-on-write and spillable buffers, we need the + ability to detect external access to the underlying memory. We say that + the buffer has been exposed if the device pointer (integer or void*) has + been accessed outside of BufferOwner. In this case, we have no control + over knowing if the data is being modified by a third party. + + Use `_from_device_memory` and `_from_host_memory` to create + a new instance from either device or host memory respectively. 
""" _ptr: int _size: int _owner: object + _exposed: bool + # The set of buffers that point to this owner. + _slices: weakref.WeakSet[Buffer] def __init__(self): raise ValueError( @@ -107,8 +124,8 @@ def __init__(self): ) @classmethod - def _from_device_memory(cls, data: Any) -> Self: - """Create a Buffer from an object exposing `__cuda_array_interface__`. + def _from_device_memory(cls, data: Any, exposed: bool) -> Self: + """Create from an object providing a `__cuda_array_interface__`. No data is being copied. @@ -116,16 +133,29 @@ def _from_device_memory(cls, data: Any) -> Self: ---------- data : device-buffer-like An object implementing the CUDA Array Interface. + exposed : bool + Mark the buffer as permanently exposed. This is used by + ExposureTrackedBuffer to determine when a deep copy is required + and by SpillableBuffer to mark the buffer unspillable. Returns ------- - Buffer - Buffer representing the same device memory as `data` + BufferOwner + BufferOwner wrapping `data` + + Raises + ------ + AttributeError + If data does not support the cuda array interface + ValueError + If the resulting buffer has negative size """ # Bypass `__init__` and initialize attributes manually ret = cls.__new__(cls) ret._owner = data + ret._exposed = exposed + ret._slices = weakref.WeakSet() if isinstance(data, rmm.DeviceBuffer): # Common case shortcut ret._ptr = data.ptr ret._size = data.size @@ -139,7 +169,7 @@ def _from_device_memory(cls, data: Any) -> Self: @classmethod def _from_host_memory(cls, data: Any) -> Self: - """Create a Buffer from a buffer or array like object + """Create an owner from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or be convertible to a buffer object using `numpy.array()` @@ -155,8 +185,8 @@ def _from_host_memory(cls, data: Any) -> Self: Returns ------- - Buffer - Buffer representing a copy of `data`. + BufferOwner + BufferOwner wrapping a device copy of `data`. 
""" # Convert to numpy array, this will not copy data in most cases. @@ -166,54 +196,7 @@ def _from_host_memory(cls, data: Any) -> Self: # Copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) # Create from device memory - return cls._from_device_memory(buf) - - def _getitem(self, offset: int, size: int) -> Self: - """ - Sub-classes can overwrite this to implement __getitem__ - without having to handle non-slice inputs. - """ - return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=self.get_ptr(mode="read") + offset, - size=size, - owner=self.owner, - ) - ) - - def __getitem__(self, key: slice) -> Self: - """Create a new slice of the buffer.""" - if not isinstance(key, slice): - raise TypeError( - "Argument 'key' has incorrect type " - f"(expected slice, got {key.__class__.__name__})" - ) - start, stop, step = key.indices(self.size) - if step != 1: - raise ValueError("slice must be C-contiguous") - return self._getitem(offset=start, size=stop - start) - - def copy(self, deep: bool = True) -> Self: - """ - Return a copy of Buffer. - - Parameters - ---------- - deep : bool, default True - If True, returns a deep copy of the underlying Buffer data. - If False, returns a shallow copy of the Buffer pointing to - the same underlying data. 
- - Returns - ------- - Buffer - """ - if deep: - return self._from_device_memory( - rmm.DeviceBuffer(ptr=self.get_ptr(mode="read"), size=self.size) - ) - else: - return self[:] + return cls._from_device_memory(buf, exposed=False) @property def size(self) -> int: @@ -226,20 +209,29 @@ def nbytes(self) -> int: return self._size @property - def owner(self) -> Any: + def owner(self) -> object: """Object owning the memory of the buffer.""" return self._owner @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" - return { - "data": (self.get_ptr(mode="write"), False), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } + def exposed(self) -> bool: + """The current exposure status of the buffer + + This is used by ExposureTrackedBuffer to determine when a deep copy + is required and by SpillableBuffer to mark the buffer unspillable. + """ + return self._exposed + + def mark_exposed(self) -> None: + """Mark the buffer as "exposed" permanently + + This is used by ExposureTrackedBuffer to determine when a deep copy + is required and by SpillableBuffer to mark the buffer unspillable. + + Notice, once the exposure status becomes True, it will never change + back. + """ + self._exposed = True def get_ptr(self, *, mode: Literal["read", "write"]) -> int: """Device pointer to the start of the buffer. @@ -277,20 +269,148 @@ def memoryview( ) return memoryview(host_buf).toreadonly() + def __str__(self) -> str: + return ( + f"<{self.__class__.__name__} size={format_bytes(self._size)} " + f"ptr={hex(self._ptr)} owner={self._owner!r}>" + ) + + +class Buffer(Serializable): + """A buffer that represents a slice or view of a `BufferOwner`. + + Use the factory function `as_buffer` to create a Buffer instance. + + Note + ---- + This buffer is untyped, so all indexing and sizes are in bytes. + + Parameters + ---------- + owner + The owning exposure buffer this refers to. 
+ offset + The offset relative to the start memory of owner (in bytes). + size + The size of the buffer (in bytes). If None, use the size of owner. + """ + + def __init__( + self, + *, + owner: BufferOwner, + offset: int = 0, + size: Optional[int] = None, + ) -> None: + size = owner.size if size is None else size + if size < 0: + raise ValueError("size cannot be negative") + if offset < 0: + raise ValueError("offset cannot be negative") + if offset + size > owner.size: + raise ValueError( + "offset+size cannot be greater than the size of owner" + ) + self._owner = owner + self._offset = offset + self._size = size + + @property + def size(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def nbytes(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def owner(self) -> BufferOwner: + """Object owning the memory of the buffer.""" + return self._owner + + def __getitem__(self, key: slice) -> Self: + """Create a new slice of the buffer.""" + if not isinstance(key, slice): + raise TypeError( + "Argument 'key' has incorrect type " + f"(expected slice, got {key.__class__.__name__})" + ) + start, stop, step = key.indices(self.size) + if step != 1: + raise ValueError("slice must be C-contiguous") + return self.__class__( + owner=self._owner, offset=self._offset + start, size=stop - start + ) + + def get_ptr(self, *, mode: Literal["read", "write"]) -> int: + return self._owner.get_ptr(mode=mode) + self._offset + + def memoryview(self) -> memoryview: + return self._owner.memoryview(offset=self._offset, size=self._size) + + def copy(self, deep: bool = True) -> Self: + """Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + - If deep=True, returns a deep copy of the underlying data. + - If deep=False, returns a new `Buffer` instance that refers + to the same `BufferOwner` as this one. Thus, no device + data are being copied. 
+ + Returns + ------- + Buffer + A new buffer that either refers to either a new or an existing + `BufferOwner` depending on the `deep` argument (see above). + """ + + # When doing a shallow copy, we just return a new slice + if not deep: + return self.__class__( + owner=self._owner, offset=self._offset, size=self._size + ) + + # Otherwise, we create a new copy of the memory + owner = self._owner._from_device_memory( + rmm.DeviceBuffer( + ptr=self._owner.get_ptr(mode="read") + self._offset, + size=self.size, + ), + exposed=False, + ) + return self.__class__(owner=owner, offset=0, size=owner.size) + + @property + def __cuda_array_interface__(self) -> Mapping: + """Implementation of the CUDA Array Interface.""" + return { + "data": (self.get_ptr(mode="write"), False), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + def serialize(self) -> Tuple[dict, list]: """Serialize the buffer into header and frames. - The frames can be a mixture of memoryview and Buffer objects. + The frames can be a mixture of memoryview, Buffer, and BufferOwner + objects. Returns ------- Tuple[dict, List] The first element of the returned tuple is a dict containing any serializable metadata required to reconstruct the object. The - second element is a list containing Buffers and memoryviews. + second element is a list containing single frame. 
""" header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) + header["owner-type-serialized"] = pickle.dumps(type(self._owner)) header["frame_count"] = 1 frames = [self] return header, frames @@ -317,16 +437,27 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized + owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) if hasattr(frame, "__cuda_array_interface__"): - return cls._from_device_memory(frame) - return cls._from_host_memory(frame) + owner = owner_type._from_device_memory(frame, exposed=False) + else: + owner = owner_type._from_host_memory(frame) + return cls( + owner=owner, + offset=0, + size=owner.size, + ) def __repr__(self) -> str: - klass = self.__class__ - name = f"{klass.__module__}.{klass.__qualname__}" return ( - f"<{name} size={format_bytes(self._size)} " - f"ptr={hex(self._ptr)} owner={repr(self._owner)}>" + f"{self.__class__.__name__}(owner={self._owner!r}, " + f"offset={self._offset!r}, size={self._size!r})" + ) + + def __str__(self) -> str: + return ( + f"<{self.__class__.__name__} size={format_bytes(self._size)} " + f"offset={format_bytes(self._offset)} of {self._owner}>" ) diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index f2ac6301944..4c08016adbb 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -1,241 +1,47 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from __future__ import annotations -import weakref -from typing import ( - Any, - Container, - Literal, - Mapping, - Optional, - Type, - TypeVar, - cast, -) +from typing import Literal, Mapping, Optional from typing_extensions import Self import cudf -from cudf.core.buffer.buffer import Buffer, get_ptr_and_size -from cudf.utils.string import format_bytes - -T = TypeVar("T", bound="ExposureTrackedBuffer") - - -def get_owner(data, klass: Type[T]) -> Optional[T]: - """Get the owner of `data`, if any exist - - Search through the stack of data owners in order to find an - owner of type `klass` (not subclasses). - - Parameters - ---------- - data - The data object - - Return - ------ - klass or None - The owner of `data` if `klass` or None. - """ - - if type(data) is klass: - return data - if hasattr(data, "owner"): - return get_owner(data.owner, klass) - return None - - -def as_exposure_tracked_buffer( - data, exposed: bool, subclass: Optional[Type[T]] = None -) -> BufferSlice: - """Factory function to wrap `data` in a slice of an exposure tracked buffer - - If `subclass` is None, a new ExposureTrackedBuffer that points to the - memory of `data` is created and a BufferSlice that points to all of the - new ExposureTrackedBuffer is returned. - - If `subclass` is not None, a new `subclass` is created instead. Still, - a BufferSlice that points to all of the new `subclass` is returned - - It is illegal for an exposure tracked buffer to own another exposure - tracked buffer. When representing the same memory, we should have a single - exposure tracked buffer and multiple buffer slices. - - Developer Notes - --------------- - This function always returns slices thus all buffers in cudf will use - `BufferSlice` when copy-on-write is enabled. The slices implement - copy-on-write by trigging deep copies when write access is detected - and multiple slices points to the same exposure tracked buffer. 
- - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represents C-contiguous memory. - exposed - Mark the buffer as permanently exposed. - subclass - If not None, a subclass of ExposureTrackedBuffer to wrap `data`. - - Return - ------ - BufferSlice - A buffer slice that points to a ExposureTrackedBuffer (or `subclass`), - which in turn wraps `data`. - """ - - if not hasattr(data, "__cuda_array_interface__"): - if exposed: - raise ValueError("cannot created exposed host memory") - return cast( - BufferSlice, ExposureTrackedBuffer._from_host_memory(data)[:] - ) - - owner = get_owner(data, subclass or ExposureTrackedBuffer) - if owner is None: - return cast( - BufferSlice, - ExposureTrackedBuffer._from_device_memory(data, exposed=exposed)[ - : - ], - ) - - # At this point, we know that `data` is owned by a exposure tracked buffer - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - if size > 0 and owner._ptr == 0: - raise ValueError("Cannot create a non-empty slice of a null buffer") - return BufferSlice(base=owner, offset=ptr - owner._ptr, size=size) +from cudf.core.buffer.buffer import Buffer, BufferOwner class ExposureTrackedBuffer(Buffer): - """A Buffer that tracks its "expose" status. - - In order to implement copy-on-write and spillable buffers, we need the - ability to detect external access to the underlying memory. We say that - the buffer has been exposed if the device pointer (integer or void*) has - been accessed outside of ExposureTrackedBuffer. In this case, we have no - control over knowing if the data is being modified by a third-party. - - Attributes - ---------- - _exposed - The current exposure status of the buffer. Notice, once the exposure - status becomes True, it should never change back. - _slices - The set of BufferSlice instances that point to this buffer. 
- """ - - _exposed: bool - _slices: weakref.WeakSet[BufferSlice] - - @property - def exposed(self) -> bool: - return self._exposed - - def mark_exposed(self) -> None: - """Mark the buffer as "exposed" permanently""" - self._exposed = True - - @classmethod - def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: - """Create an exposure tracked buffer from device memory. - - No data is being copied. - - Parameters - ---------- - data : device-buffer-like - An object implementing the CUDA Array Interface. - exposed : bool, optional - Mark the buffer as permanently exposed. - - Returns - ------- - ExposureTrackedBuffer - Buffer representing the same device memory as `data` - """ - ret = super()._from_device_memory(data) - ret._exposed = exposed - ret._slices = weakref.WeakSet() - return ret - - def _getitem(self, offset: int, size: int) -> BufferSlice: - return BufferSlice(base=self, offset=offset, size=size) - - @property - def __cuda_array_interface__(self) -> Mapping: - self.mark_exposed() - return super().__cuda_array_interface__ - - def __repr__(self) -> str: - return ( - f"" - ) - - -class BufferSlice(ExposureTrackedBuffer): - """A slice (aka. a view) of a exposure tracked buffer. + """An exposure tracked buffer. Parameters ---------- - base - The exposure tracked buffer this slice refers to. + owner + The owning exposure tracked buffer this refers to. offset - The offset relative to the start memory of base (in bytes). + The offset relative to the start memory of owner (in bytes). size The size of the slice (in bytes) - passthrough_attributes - Name of attributes that are passed through to the base as-is. 
""" + _owner: BufferOwner + def __init__( self, - base: ExposureTrackedBuffer, - offset: int, - size: int, - *, - passthrough_attributes: Container[str] = ("exposed",), + owner: BufferOwner, + offset: int = 0, + size: Optional[int] = None, ) -> None: - if size < 0: - raise ValueError("size cannot be negative") - if offset < 0: - raise ValueError("offset cannot be negative") - if offset + size > base.size: - raise ValueError( - "offset+size cannot be greater than the size of base" - ) - self._base = base - self._offset = offset - self._size = size - self._owner = base - self._passthrough_attributes = passthrough_attributes - base._slices.add(self) - - def __getattr__(self, name): - if name in self._passthrough_attributes: - return getattr(self._base, name) - raise AttributeError( - f"{self.__class__.__name__} object has no attribute {name}" - ) + super().__init__(owner=owner, offset=offset, size=size) + self._owner._slices.add(self) - def _getitem(self, offset: int, size: int) -> BufferSlice: - return BufferSlice( - base=self._base, offset=offset + self._offset, size=size - ) + @property + def exposed(self) -> bool: + return self._owner.exposed def get_ptr(self, *, mode: Literal["read", "write"]) -> int: if mode == "write" and cudf.get_option("copy_on_write"): self.make_single_owner_inplace() - return self._base.get_ptr(mode=mode) + self._offset - - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: - return self._base.memoryview(offset=self._offset + offset, size=size) + return super().get_ptr(mode=mode) def copy(self, deep: bool = True) -> Self: """Return a copy of Buffer. 
@@ -260,16 +66,14 @@ def copy(self, deep: bool = True) -> Self: Returns ------- - BufferSlice - A slice pointing to either a new or the existing base buffer - depending on the expose status of the base buffer and the + ExposureTrackedBuffer + A slice pointing to either a new or the existing owner + depending on the expose status of the owner and the copy-on-write option (see above). """ if cudf.get_option("copy_on_write"): - base_copy = self._base.copy(deep=deep or self.exposed) - else: - base_copy = self._base.copy(deep=deep) - return cast(Self, base_copy[self._offset : self._offset + self._size]) + return super().copy(deep=deep or self.exposed) + return super().copy(deep=deep) @property def __cuda_array_interface__(self) -> Mapping: @@ -278,7 +82,7 @@ def __cuda_array_interface__(self) -> Mapping: return super().__cuda_array_interface__ def make_single_owner_inplace(self) -> None: - """Make sure this slice is the only one pointing to the base. + """Make sure this slice is the only one pointing to the owner. This is used by copy-on-write to trigger a deep copy when write access is detected. @@ -294,18 +98,11 @@ def make_single_owner_inplace(self) -> None: Buffer representing the same device memory as `data` """ - if len(self._base._slices) > 1: - # If this is not the only slice pointing to `self._base`, we - # point to a new deep copy of the base. + if len(self._owner._slices) > 1: + # If this is not the only slice pointing to `self._owner`, we + # point to a new deep copy of the owner. 
t = self.copy(deep=True) - self._base = t._base + self._owner = t._owner self._offset = t._offset self._size = t._size - self._owner = t._base - self._base._slices.add(self) - - def __repr__(self) -> str: - return ( - f"" - ) + self._owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 91f3b2cd544..3e654e01401 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,7 +16,7 @@ import rmm.mr -from cudf.core.buffer.spillable_buffer import SpillableBuffer +from cudf.core.buffer.spillable_buffer import SpillableBufferOwner from cudf.options import get_option from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes @@ -128,7 +128,7 @@ def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: total_time + time, ) - def log_expose(self, buf: SpillableBuffer) -> None: + def log_expose(self, buf: SpillableBufferOwner) -> None: """Log an expose event We track logged exposes by grouping them by their traceback such @@ -224,7 +224,7 @@ class SpillManager: SpillStatistics for the different levels. 
""" - _buffers: weakref.WeakValueDictionary[int, SpillableBuffer] + _buffers: weakref.WeakValueDictionary[int, SpillableBufferOwner] statistics: SpillStatistics def __init__( @@ -298,14 +298,14 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: ) return False # Since we didn't find anything to spill, we give up - def add(self, buffer: SpillableBuffer) -> None: + def add(self, buffer: SpillableBufferOwner) -> None: """Add buffer to the set of managed buffers The manager keeps a weak reference to the buffer Parameters ---------- - buffer : SpillableBuffer + buffer : SpillableBufferOwner The buffer to manage """ if buffer.size > 0 and not buffer.exposed: @@ -316,7 +316,7 @@ def add(self, buffer: SpillableBuffer) -> None: def buffers( self, order_by_access_time: bool = False - ) -> Tuple[SpillableBuffer, ...]: + ) -> Tuple[SpillableBufferOwner, ...]: """Get all managed buffers Parameters diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 1856bec1876..aeac4b76e58 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,8 +16,8 @@ from cudf.core.buffer.buffer import ( Buffer, + BufferOwner, cuda_array_interface_wrapper, - get_ptr_and_size, host_memory_allocation, ) from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate @@ -27,86 +27,6 @@ from cudf.core.buffer.spill_manager import SpillManager -def get_spillable_owner(data) -> Optional[SpillableBuffer]: - """Get the spillable owner of `data`, if any exist - - Search through the stack of data owners in order to find an - owner of type `SpillableBuffer` (not subclasses). - - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represent C-contiguous memory. 
- - Return - ------ - SpillableBuffer or None - The owner of `data` if spillable or None. - """ - - if type(data) is SpillableBuffer: - return data - if hasattr(data, "owner"): - return get_spillable_owner(data.owner) - return None - - -def as_spillable_buffer(data, exposed: bool) -> SpillableBuffer: - """Factory function to wrap `data` in a SpillableBuffer object. - - If `data` isn't a buffer already, a new buffer that points to the memory of - `data` is created. If `data` represents host memory, it is copied to a new - `rmm.DeviceBuffer` device allocation. Otherwise, the memory of `data` is - **not** copied, instead the new buffer keeps a reference to `data` in order - to retain its lifetime. - - If `data` is owned by a spillable buffer, a "slice" of the buffer is - returned. In this case, the spillable buffer must either be "exposed" or - spilled locked (called within an acquire_spill_lock context). This is to - guarantee that the memory of `data` isn't spilled before this function gets - to calculate the offset of the new slice. - - It is illegal for a spillable buffer to own another spillable buffer. - - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represent C-contiguous memory. - exposed : bool, optional - Mark the buffer as permanently exposed (unspillable). - - Return - ------ - SpillableBuffer - A spillabe buffer instance that represents the device memory of `data`. - """ - - from cudf.core.buffer.utils import get_spill_lock - - if not hasattr(data, "__cuda_array_interface__"): - if exposed: - raise ValueError("cannot created exposed host memory") - return SpillableBuffer._from_host_memory(data) - - spillable_owner = get_spillable_owner(data) - if spillable_owner is None: - return SpillableBuffer._from_device_memory(data, exposed=exposed) - - if not spillable_owner.exposed and get_spill_lock() is None: - raise ValueError( - "A owning spillable buffer must " - "either be exposed or spilled locked." 
- ) - - # At this point, we know that `data` is owned by a spillable buffer, - # which is exposed or spilled locked. - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - base_ptr = spillable_owner.memory_info()[0] - return SpillableBufferSlice( - spillable_owner, offset=ptr - base_ptr, size=size - ) - - class SpillLock: pass @@ -141,7 +61,7 @@ def __getitem__(self, i): raise IndexError("tuple index out of range") -class SpillableBuffer(Buffer): +class SpillableBufferOwner(BufferOwner): """A Buffer that supports spilling memory off the GPU to avoid OOMs. This buffer supports spilling the represented data to host memory. @@ -150,9 +70,9 @@ class SpillableBuffer(Buffer): device memory usage see `cudf.core.buffer.spill_manager.SpillManager`. Unspill is triggered automatically when accessing the data of the buffer. - The buffer might not be spillable, which is based on the "expose" status - of the buffer. We say that the buffer has been exposed if the device - pointer (integer or void*) has been accessed outside of SpillableBuffer. + The buffer might not be spillable, which is based on the "expose" status of + the buffer. We say that the buffer has been exposed if the device pointer + (integer or void*) has been accessed outside of SpillableBufferOwner. In this case, we cannot invalidate the device pointer by moving the data to host. @@ -160,17 +80,17 @@ class SpillableBuffer(Buffer): property. To avoid this, one can use `.get_ptr()` instead, which support exposing the buffer temporarily. - Use the factory function `as_buffer` to create a SpillableBuffer instance. + Use the factory function `as_buffer` to create a SpillableBufferOwner + instance. 
""" lock: RLock _spill_locks: weakref.WeakSet _last_accessed: float _ptr_desc: Dict[str, Any] - _exposed: bool _manager: SpillManager - def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: + def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: """Finish initialization of the spillable buffer This implements the common initialization that `_from_device_memory` @@ -180,8 +100,6 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: ---------- ptr_desc : dict Description of the memory. - exposed : bool, optional - Mark the buffer as permanently exposed (unspillable). """ from cudf.core.buffer.spill_manager import get_global_manager @@ -190,7 +108,6 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: self._spill_locks = weakref.WeakSet() self._last_accessed = time.monotonic() self._ptr_desc = ptr_desc - self._exposed = exposed manager = get_global_manager() if manager is None: raise ValueError( @@ -202,7 +119,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: self._manager.add(self) @classmethod - def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: + def _from_device_memory(cls, data: Any, exposed: bool) -> Self: """Create a spillabe buffer from device memory. No data is being copied. @@ -211,16 +128,16 @@ def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: ---------- data : device-buffer-like An object implementing the CUDA Array Interface. - exposed : bool, optional + exposed : bool Mark the buffer as permanently exposed (unspillable). 
Returns ------- - SpillableBuffer + SpillableBufferOwner Buffer representing the same device memory as `data` """ - ret = super()._from_device_memory(data) - ret._finalize_init(ptr_desc={"type": "gpu"}, exposed=exposed) + ret = super()._from_device_memory(data, exposed=exposed) + ret._finalize_init(ptr_desc={"type": "gpu"}) return ret @classmethod @@ -241,7 +158,7 @@ def _from_host_memory(cls, data: Any) -> Self: Returns ------- - SpillableBuffer + SpillableBufferOwner Buffer representing a copy of `data`. """ @@ -257,20 +174,14 @@ def _from_host_memory(cls, data: Any) -> Self: ret._owner = None ret._ptr = 0 ret._size = data.nbytes - ret._finalize_init( - ptr_desc={"type": "cpu", "memoryview": data}, exposed=False - ) + ret._exposed = False + ret._finalize_init(ptr_desc={"type": "cpu", "memoryview": data}) return ret @property def is_spilled(self) -> bool: return self._ptr_desc["type"] != "gpu" - def copy(self, deep: bool = True) -> Self: - spill_lock = SpillLock() - self.spill_lock(spill_lock=spill_lock) - return super().copy(deep=deep) - def spill(self, target: str = "cpu") -> None: """Spill or un-spill this buffer in-place @@ -343,10 +254,10 @@ def mark_exposed(self) -> None: self._manager.spill_to_device_limit() with self.lock: - if not self._exposed: + if not self.exposed: self._manager.statistics.log_expose(self) self.spill(target="gpu") - self._exposed = True + super().mark_exposed() self._last_accessed = time.monotonic() def spill_lock(self, spill_lock: SpillLock) -> None: @@ -415,25 +326,9 @@ def memory_info(self) -> Tuple[int, int, str]: ).__array_interface__["data"][0] return (ptr, self.nbytes, self._ptr_desc["type"]) - @property - def owner(self) -> Any: - return self._owner - - @property - def exposed(self) -> bool: - return self._exposed - @property def spillable(self) -> bool: - return not self._exposed and len(self._spill_locks) == 0 - - @property - def size(self) -> int: - return self._size - - @property - def nbytes(self) -> int: - return 
self._size + return not self.exposed and len(self._spill_locks) == 0 @property def last_accessed(self) -> float: @@ -465,148 +360,114 @@ def memoryview( ) return ret - def _getitem(self, offset: int, size: int) -> SpillableBufferSlice: - return SpillableBufferSlice(base=self, offset=offset, size=size) - - def serialize(self) -> Tuple[dict, list]: - """Serialize the Buffer - - Normally, we would use `[self]` as the frames. This would work but - also mean that `self` becomes exposed permanently if the frames are - later accessed through `__cuda_array_interface__`, which is exactly - what libraries like Dask+UCX would do when communicating! - - The sound solution is to modify Dask et al. so that they access the - frames through `.get_ptr()` and holds on to the `spill_lock` until - the frame has been transferred. However, until this adaptation we - use a hack where the frame is a `Buffer` with a `spill_lock` as the - owner, which makes `self` unspillable while the frame is alive but - doesn't expose `self` when `__cuda_array_interface__` is accessed. - - Warning, this hack means that the returned frame must be copied before - given to `.deserialize()`, otherwise we would have a `Buffer` pointing - to memory already owned by an existing `SpillableBuffer`. 
- """ - header: Dict[Any, Any] - frames: List[Buffer | memoryview] - with self.lock: - header = {} - header["type-serialized"] = pickle.dumps(self.__class__) - header["frame_count"] = 1 - if self.is_spilled: - frames = [self.memoryview()] - else: - # TODO: Use `frames=[self]` instead of this hack, see doc above - spill_lock = SpillLock() - self.spill_lock(spill_lock) - ptr, size, _ = self.memory_info() - frames = [ - Buffer._from_device_memory( - cuda_array_interface_wrapper( - ptr=ptr, - size=size, - owner=(self._owner, spill_lock), - ) - ) - ] - return header, frames - - def __repr__(self) -> str: + def __str__(self) -> str: if self._ptr_desc["type"] != "gpu": ptr_info = str(self._ptr_desc) else: ptr_info = str(hex(self._ptr)) return ( - f"" ) -class SpillableBufferSlice(SpillableBuffer): +class SpillableBuffer(Buffer): """A slice of a spillable buffer This buffer applies the slicing and then delegates all - operations to its base buffer. + operations to its owning buffer. Parameters ---------- - base : SpillableBuffer - The base of the view + owner : SpillableBufferOwner + The owner of the view offset : int - Memory offset into the base buffer + Memory offset into the owning buffer size : int Size of the view (in bytes) """ - def __init__(self, base: SpillableBuffer, offset: int, size: int) -> None: - if size < 0: - raise ValueError("size cannot be negative") - if offset < 0: - raise ValueError("offset cannot be negative") - if offset + size > base.size: - raise ValueError( - "offset+size cannot be greater than the size of base" - ) - self._base = base - self._offset = offset - self._size = size - self._owner = base - self.lock = base.lock - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - """ - A passthrough method to `SpillableBuffer.get_ptr` - with factoring in the `offset`. 
- """ - return self._base.get_ptr(mode=mode) + self._offset - - def _getitem(self, offset: int, size: int) -> SpillableBufferSlice: - return SpillableBufferSlice( - base=self._base, offset=offset + self._offset, size=size - ) + _owner: SpillableBufferOwner - @classmethod - def deserialize(cls, header: dict, frames: list): - # TODO: because of the hack in `SpillableBuffer.serialize()` where - # frames are of type `Buffer`, we always deserialize as if they are - # `SpillableBuffer`. In the future, we should be able to - # deserialize into `SpillableBufferSlice` when the frames hasn't been - # copied. - return SpillableBuffer.deserialize(header, frames) - - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: - size = self._size if size is None else size - return self._base.memoryview(offset=self._offset + offset, size=size) - - def __repr__(self) -> str: - return ( - f" None: - return self._base.spill(target=target) + return self._owner.spill(target=target) @property def is_spilled(self) -> bool: - return self._base.is_spilled + return self._owner.is_spilled @property def exposed(self) -> bool: - return self._base.exposed + return self._owner.exposed @property def spillable(self) -> bool: - return self._base.spillable + return self._owner.spillable def spill_lock(self, spill_lock: SpillLock) -> None: - self._base.spill_lock(spill_lock=spill_lock) + self._owner.spill_lock(spill_lock=spill_lock) def memory_info(self) -> Tuple[int, int, str]: - (ptr, _, device_type) = self._base.memory_info() + (ptr, _, device_type) = self._owner.memory_info() return (ptr + self._offset, self.nbytes, device_type) + + def mark_exposed(self) -> None: + self._owner.mark_exposed() + + def serialize(self) -> Tuple[dict, list]: + """Serialize the Buffer + + Normally, we would use `[self]` as the frames. 
This would work but + also mean that `self` becomes exposed permanently if the frames are + later accessed through `__cuda_array_interface__`, which is exactly + what libraries like Dask+UCX would do when communicating! + + The sound solution is to modify Dask et al. so that they access the + frames through `.get_ptr()` and holds on to the `spill_lock` until + the frame has been transferred. However, until this adaptation we + use a hack where the frame is a `Buffer` with a `spill_lock` as the + owner, which makes `self` unspillable while the frame is alive but + doesn't expose `self` when `__cuda_array_interface__` is accessed. + + Warning, this hack means that the returned frame must be copied before + given to `.deserialize()`, otherwise we would have a `Buffer` pointing + to memory already owned by an existing `SpillableBufferOwner`. + """ + header: Dict[str, Any] = {} + frames: List[Buffer | memoryview] + with self._owner.lock: + header["type-serialized"] = pickle.dumps(self.__class__) + header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["frame_count"] = 1 + if self.is_spilled: + frames = [self.memoryview()] + else: + # TODO: Use `frames=[self]` instead of this hack, see doc above + spill_lock = SpillLock() + self.spill_lock(spill_lock) + ptr, size, _ = self.memory_info() + frames = [ + Buffer( + owner=BufferOwner._from_device_memory( + cuda_array_interface_wrapper( + ptr=ptr, + size=size, + owner=(self._owner, spill_lock), + ), + exposed=False, + ) + ) + ] + return header, frames + + @property + def __cuda_array_interface__(self) -> dict: + return { + "data": DelayedPointerTuple(self), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 373be99ec96..c2ec7effd13 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -1,18 +1,51 @@ -# Copyright (c) 2022-2023, NVIDIA 
CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations import threading from contextlib import ContextDecorator -from typing import Any, Dict, Optional, Tuple, Union - -from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper -from cudf.core.buffer.exposure_tracked_buffer import as_exposure_tracked_buffer +from typing import Any, Dict, Optional, Tuple, Type, Union + +from cudf.core.buffer.buffer import ( + Buffer, + BufferOwner, + cuda_array_interface_wrapper, + get_ptr_and_size, +) +from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.buffer.spillable_buffer import SpillLock, as_spillable_buffer +from cudf.core.buffer.spillable_buffer import ( + SpillableBuffer, + SpillableBufferOwner, + SpillLock, +) from cudf.options import get_option +def get_buffer_owner(data: Any) -> Optional[BufferOwner]: + """Get the owner of `data`, if one exists + + Search through the stack of data owners in order to find an + owner BufferOwner (incl. subclasses). + + Parameters + ---------- + data + The data object to search for a BufferOwner instance + + Return + ------ + BufferOwner or None + The owner of `data` if found otherwise None. + """ + + if isinstance(data, BufferOwner): + return data + if hasattr(data, "owner"): + return get_buffer_owner(data.owner) + return None + + def as_buffer( data: Union[int, Any], *, @@ -30,7 +63,17 @@ def as_buffer( If `data` is an integer, it is assumed to point to device memory. - Raises ValueError if data isn't C-contiguous. + Raises ValueError if `data` isn't C-contiguous. + + If copy-on-write is enabled, an ExposureTrackedBuffer is returned. + + If spilling is enabled, a SpillableBuffer that refers to a + SpillableBufferOwner is returned. If `data` is owned by a spillable buffer, + it must either be "exposed" or spill locked (called within an + acquire_spill_lock context). 
This is to guarantee that the memory of `data` + isn't spilled before this function gets to calculate the offset of the new + SpillableBuffer. + Parameters ---------- @@ -73,13 +116,49 @@ def as_buffer( "`data` is a buffer-like or array-like object" ) - if get_option("copy_on_write"): - return as_exposure_tracked_buffer(data, exposed=exposed) + # Find the buffer types to return based on the current config + owner_class: Type[BufferOwner] + buffer_class: Type[Buffer] if get_global_manager() is not None: - return as_spillable_buffer(data, exposed=exposed) - if hasattr(data, "__cuda_array_interface__"): - return Buffer._from_device_memory(data) - return Buffer._from_host_memory(data) + owner_class = SpillableBufferOwner + buffer_class = SpillableBuffer + elif get_option("copy_on_write"): + owner_class = BufferOwner + buffer_class = ExposureTrackedBuffer + else: + owner_class = BufferOwner + buffer_class = Buffer + + # Handle host memory, + if not hasattr(data, "__cuda_array_interface__"): + if exposed: + raise ValueError("cannot created exposed host memory") + return buffer_class(owner=owner_class._from_host_memory(data)) + + # Check if `data` is owned by a known class + owner = get_buffer_owner(data) + if owner is None: # `data` is new device memory + return buffer_class( + owner=owner_class._from_device_memory(data, exposed=exposed) + ) + + # At this point, we know that `data` is owned by a known class, which + # should be the same class as specified by the current config (see above) + assert owner.__class__ is owner_class + if ( + isinstance(owner, SpillableBufferOwner) + and not owner.exposed + and get_spill_lock() is None + ): + raise ValueError( + "An owning spillable buffer must " + "either be exposed or spill locked." 
+ ) + ptr, size = get_ptr_and_size(data.__cuda_array_interface__) + base_ptr = owner.get_ptr(mode="read") + if size > 0 and base_ptr == 0: + raise ValueError("Cannot create a non-empty slice of a null buffer") + return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) _thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {} diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 1c9e7475080..03637e05eae 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import pytest @@ -64,7 +64,14 @@ def test_buffer_creation_from_any(): assert isinstance(b, Buffer) assert ary.data.ptr == b.get_ptr(mode="read") assert ary.nbytes == b.size - assert b.owner.owner is ary + assert b.owner.owner.owner is ary + + +@pytest.mark.parametrize("size", [10, 2**10 + 500, 2**20]) +def test_buffer_str(size): + ary = cp.arange(size, dtype="uint8") + buf = as_buffer(ary) + assert f"size={size}" in repr(buf) @pytest.mark.parametrize( @@ -73,7 +80,7 @@ def test_buffer_creation_from_any(): def test_buffer_repr(size, expect): ary = cp.arange(size, dtype="uint8") buf = as_buffer(ary) - assert f"size={expect}" in repr(buf) + assert f"size={expect}" in str(buf) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 085774e9dbc..e737a73e86b 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import cupy as cp import numpy as np @@ -113,11 +113,8 @@ def test_series_setitem_partial_slice_cow_on(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - # TODO: when COW and spilling has been unified, find a clean way to - # test this without accessing the internal attributes _base and _ptr assert ( - new_slice._column.base_data._base._ptr - == actual._column.base_data._base._ptr + new_slice._column.base_data.owner == actual._column.base_data.owner ) new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) @@ -134,9 +131,11 @@ def test_series_setitem_partial_slice_cow_off(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - assert ( - new_slice._column.base_data._ptr == actual._column.base_data._ptr - ) + # Since COW is off, a slice should point to the same memory + ptr1 = new_slice._column.base_data.get_ptr(mode="read") + ptr2 = actual._column.base_data.get_ptr(mode="read") + assert ptr1 == ptr2 + new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 88ce908aa5f..7e66a7ab4ba 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
import importlib import random @@ -34,7 +34,7 @@ ) from cudf.core.buffer.spillable_buffer import ( SpillableBuffer, - SpillableBufferSlice, + SpillableBufferOwner, SpillLock, ) from cudf.testing._utils import assert_eq @@ -196,10 +196,10 @@ def test_creations(manager: SpillManager): def test_spillable_df_groupby(manager: SpillManager): df = cudf.DataFrame({"a": [1, 1, 1]}) gb = df.groupby("a") - assert len(single_column_df_base_data(df)._spill_locks) == 0 + assert len(single_column_df_base_data(df).owner._spill_locks) == 0 gb._groupby # `gb._groupby`, which is cached on `gb`, holds a spill lock - assert len(single_column_df_base_data(df)._spill_locks) == 1 + assert len(single_column_df_base_data(df).owner._spill_locks) == 1 assert not single_column_df_data(df).spillable del gb assert single_column_df_data(df).spillable @@ -375,7 +375,7 @@ def test_get_ptr(manager: SpillManager, target): mem = np.empty(10, dtype="u1") buf = as_buffer(data=mem, exposed=False) assert buf.spillable - assert len(buf._spill_locks) == 0 + assert len(buf.owner._spill_locks) == 0 with acquire_spill_lock(): buf.get_ptr(mode="read") assert not buf.spillable @@ -496,8 +496,8 @@ def test_serialize_cuda_dataframe(manager: SpillManager): header, frames = protocol.serialize( df1, serializers=("cuda",), on_error="raise" ) - buf: SpillableBufferSlice = single_column_df_data(df1) - assert len(buf._base._spill_locks) == 1 + buf: SpillableBuffer = single_column_df_data(df1) + assert len(buf.owner._spill_locks) == 1 assert len(frames) == 1 assert isinstance(frames[0], Buffer) assert frames[0].get_ptr(mode="read") == buf.get_ptr(mode="read") @@ -543,13 +543,14 @@ def test_as_buffer_of_spillable_buffer(manager: SpillManager): data = cupy.arange(10, dtype="u1") b1 = as_buffer(data, exposed=False) assert isinstance(b1, SpillableBuffer) - assert b1.owner is data + assert isinstance(b1.owner, SpillableBufferOwner) + assert b1.owner.owner is data b2 = as_buffer(b1) assert b1 is b2 with pytest.raises( 
ValueError, - match="buffer must either be exposed or spilled locked", + match="owning spillable buffer must either be exposed or spill locked", ): # Use `memory_info` to access device point _without_ making # the buffer unspillable. @@ -557,21 +558,21 @@ def test_as_buffer_of_spillable_buffer(manager: SpillManager): with acquire_spill_lock(): b3 = as_buffer(b1.get_ptr(mode="read"), size=b1.size, owner=b1) - assert isinstance(b3, SpillableBufferSlice) - assert b3.owner is b1 + assert isinstance(b3, SpillableBuffer) + assert b3.owner is b1.owner b4 = as_buffer( b1.get_ptr(mode="write") + data.itemsize, size=b1.size - data.itemsize, owner=b3, ) - assert isinstance(b4, SpillableBufferSlice) - assert b4.owner is b1 + assert isinstance(b4, SpillableBuffer) + assert b4.owner is b1.owner assert all(cupy.array(b4.memoryview()) == data[1:]) b5 = as_buffer(b4.get_ptr(mode="write"), size=b4.size - 1, owner=b4) - assert isinstance(b5, SpillableBufferSlice) - assert b5.owner is b1 + assert isinstance(b5, SpillableBuffer) + assert b5.owner is b1.owner assert all(cupy.array(b5.memoryview()) == data[1:-1]) From 726a7f30757d1a06d74d86bb82cf311cb159f7fd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Jan 2024 03:32:13 -1000 Subject: [PATCH 23/23] Pin pytest-cases<3.8.2 (#14756) Appears in the pytest-cases 3.8.2, there's a requirement that automatically finding `cases` must be in a file named `test_*`, when historically looks like we use `bench_*` https://github.com/smarie/python-pytest-cases/pull/320/files Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14756 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml 
| 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 21b540e24ab..a5e3ea4c531 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -74,7 +74,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases +- pytest-cases<3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index c109dcca625..579bbb6d52d 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -71,7 +71,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases +- pytest-cases<3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index 28b3afd7bbc..20998847a75 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -619,7 +619,7 @@ dependencies: - fastavro>=0.22.9 - hypothesis - pytest-benchmark - - pytest-cases + - pytest-cases<3.8.2 - python-snappy>=0.6.0 - scipy - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 7c3f4a97a5e..18771804f61 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -59,7 +59,7 @@ test = [ "msgpack", "pytest", "pytest-benchmark", - "pytest-cases", + "pytest-cases<3.8.2", "pytest-cov", "pytest-xdist", "python-snappy>=0.6.0",