diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87c40421be0..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit @@ -87,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0935de96d19..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 @@ -86,7 +86,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e52b8c5f2a0..2c16deeed82 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. 
- - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 3d965f30986..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12e6826f301..2f17b57b0a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -273,6 +273,14 @@ endif() # add third party dependencies using CPM rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. 
+CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -299,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -772,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -910,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -928,14 +930,17 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -944,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ 
using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
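[Editor's aside on the distinct-benchmark change above, which promotes the hard-coded 0.01 null probability to an NVBench axis so it can be swept from the command line. A minimal, self-contained sketch of that axis pattern — the benchmark body is an illustrative stand-in, not libcudf code:]

```cpp
#include <nvbench/nvbench.cuh>

// Stand-in benchmark body; it reads the axes the same way nvbench_distinct
// does above. The actual work being timed is omitted.
static void axis_demo(nvbench::state& state)
{
  auto const null_probability = state.get_float64("null_probability");
  auto const num_rows         = state.get_int64("NumRows");
  (void)null_probability;
  (void)num_rows;
  state.exec([](nvbench::launch&) { /* timed region would go here */ });
}

NVBENCH_BENCH(axis_demo)
  .add_float64_axis("null_probability", {0.0, 0.01, 0.1})
  .add_int64_axis("NumRows", {100'000, 10'000'000});
```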
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 9f68d85e7db..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 3d004aa55..71ce86bea 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -63,7 +63,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) - --#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) -+#if 0 - //! @brief Always dispatches to 64 bit offset version of an algorithm - # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ -@@ -89,7 +89,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ - _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) - --#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) -+#elif 1 - - //! @brief Ensures that the size of the input does not overflow the offset type - # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. 
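[Editor's aside on the logging documentation above: a minimal sketch of the runtime configuration it describes. The `<cudf/logger.hpp>` header name is an assumption based on the `rapids_make_logger(cudf ...)` call elsewhere in this diff; the exact path is not visible here.]

```cpp
#include <cudf/logger.hpp>  // assumed rapids-logger-generated header

void quiet_cudf_logging()
{
  // Runtime filter: keep only warnings and above. Messages already compiled
  // out via CUDF_LOG_ACTIVE_LEVEL cannot be re-enabled at runtime.
  cudf::default_logger().set_level(cudf::level_enum::warn);
}
```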
Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. # Data Types diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 4159e324472..9226697a7f6 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -16,300 +16,25 @@ #pragma once -#include -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. -// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. -// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? 
block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
<table> copy_if(table_view const& input, @@ -333,76 +60,22 @@ std::unique_ptr<table>
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
<table>(input, stream, mr); - } else if (output_size > 0) { - std::vector<std::unique_ptr<column>> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
<table>(input, stream, mr); } - return std::make_unique<table>
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..d16be5e22dd 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
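[Editor's aside on the `copy_if` rewrite above: it drops the bespoke block-count/scan/scatter kernels in favor of a two-phase compaction, where `thrust::copy_if` materializes the indices of passing rows and a single `gather` then builds the output table. A self-contained sketch of the same pattern in plain Thrust — the data and predicate are illustrative, not from libcudf:]

```cpp
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/distance.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>

#include <vector>

// Illustrative row predicate, standing in for the Filter functor above.
struct is_positive {
  int const* data;
  __device__ bool operator()(int idx) const { return data[idx] > 0; }
};

int main()
{
  std::vector<int> const h_input{5, -1, 7, -3, 9};
  thrust::device_vector<int> input(h_input.begin(), h_input.end());

  // Phase 1: compact the indices of rows that pass the filter.
  thrust::device_vector<int> indices(input.size());
  auto const indices_end =
    thrust::copy_if(thrust::counting_iterator<int>(0),
                    thrust::counting_iterator<int>(static_cast<int>(input.size())),
                    indices.begin(),
                    is_positive{thrust::raw_pointer_cast(input.data())});

  // Phase 2: gather the selected rows into a dense output: {5, 7, 9}.
  thrust::device_vector<int> output(thrust::distance(indices.begin(), indices_end));
  thrust::gather(indices.begin(), indices.begin() + output.size(), input.begin(), output.begin());
  return 0;
}
```

This also delegates validity-mask handling to cudf's existing gather path rather than the hand-packed warp ballots and block-boundary `atomicOr`s of the deleted kernels.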
@@ -83,7 +83,11 @@ struct DeviceSum { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -141,7 +145,11 @@ struct DeviceMin { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -189,7 +197,11 @@ struct DeviceMax { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -225,7 +237,11 @@ struct DeviceProduct { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } }; diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 8b709f2a8f8..957b6b70fe2 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, } // namespace detail /** - * Divides the left-hand-side by the right-hand-side, rounding up + * @brief Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. + * The result is undefined if `divisor == 0` or + * if `divisor == -1` and `dividend == min()`. + * + * Will not overflow, and may _or may not_ be slower than the intuitive + * approach of using `(dividend + divisor - 1) / divisor`. * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * @tparam I Integer type for `dividend`, `divisor`, and the return type + * @param dividend The number to divide + * @param divisor The number by which to divide + * @return The least integer multiple of `divisor` which is greater than or equal to + * the non-integral division `dividend/divisor` */ template constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
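[Editor's aside, returning to the `div_rounding_up_safe` documentation updated above: its contract is round-up division that cannot overflow, unlike the intuitive `(dividend + divisor - 1) / divisor`. A minimal sketch of one overflow-safe formulation for non-negative operands — the idea the doc describes, not necessarily libcudf's exact implementation:]

```cpp
// Round-up division that avoids the overflow in (dividend + divisor - 1):
// add 1 exactly when the division leaves a nonzero remainder.
// Assumes divisor > 0 and dividend >= 0, matching the documented examples.
template <typename I>
constexpr I div_rounding_up(I dividend, I divisor) noexcept
{
  return dividend / divisor + (dividend % divisor != 0);
}

static_assert(div_rounding_up(9, 5) == 2);
static_assert(div_rounding_up(10, 5) == 2);
static_assert(div_rounding_up(11, 5) == 3);
```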
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. */ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 16d532ea2b8..4f6238b5fe7 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include #include @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is then copied to device memory // and the pointer is set in the d_columns member. - std::vector h_buffer(padded_views_size_bytes); + auto h_buffer = cudf::detail::make_host_vector(padded_views_size_bytes, stream); // Each ColumnDeviceView instance may have child objects which may // require setting some internal device pointers before being copied // from CPU to device. @@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st auto d_columns = detail::child_columns_to_device_array( source_view.begin(), source_view.end(), h_ptr, d_ptr); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const h_span = host_span{h_buffer}.subspan( + static_cast(h_ptr) - h_buffer.data(), views_size_bytes); + auto const d_span = device_span{static_cast(d_ptr), views_size_bytes}; + cudf::detail::cuda_memcpy(d_span, h_span, stream); return std::make_tuple(std::move(descendant_storage), d_columns); } diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp deleted file mode 100644 index 982554a23f5..00000000000 --- a/cpp/include/cudf/utilities/logger.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace CUDF_EXPORT cudf { - -namespace detail { -spdlog::logger& logger(); -} - -/** - * @brief Returns the global logger. - * - * This is a global instance of a spdlog logger. It can be used to configure logging behavior in - * libcudf. - * - * Examples: - * @code{.cpp} - * // Turn off logging at runtime - * cudf::logger().set_level(spdlog::level::off); - * // Add a stdout sink to the logger - * cudf::logger().sinks().push_back(std::make_shared()); - * // Replace the default sink - * cudf::logger().sinks() ={std::make_shared()}; - * @endcode - * - * Note: Changes to the sinks are not thread safe and should only be done during global - * initialization. - * - * @return spdlog::logger& The logger. - */ -[[deprecated( - "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& -logger(); - -} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 21ee4fa9e9b..2273a89892b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -417,7 +417,9 @@ class base_2dspan { constexpr base_2dspan(RowType flat_view, size_t columns) : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { +#ifndef __CUDA_ARCH__ CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); +#endif } /** diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index b2c1a23f57e..f0d5d9ecb5d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext { * @file */ -/** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values for each string in input - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - /** * @brief Returns the minhash values for each string * @@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash_permuted( +std::unique_ptr minhash( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -142,67 +79,16 @@ std::unique_ptr minhash_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. - * The hash function returns 2 uint64 values but only the first value - * is used with the minhash calculation. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values as UINT64 for each string in input - */ -[[deprecated]] std::unique_ptr minhash64( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. + * @copydoc nvtext::minhash * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash() */ -[[deprecated]] std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -244,7 +130,7 @@ std::unique_ptr minhash_permuted( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64_permuted( +std::unique_ptr minhash64( cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -254,64 +140,18 @@ std::unique_ptr minhash64_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. 
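[Editor's aside summarizing the renames in this header: `minhash_permuted` and `minhash64_permuted` become `nvtext::minhash` and `nvtext::minhash64`, with the old names kept as deprecated shims. A hedged usage sketch of the 32-bit entry point; the `uint32_t` element types of `parameter_a`/`parameter_b` are inferred from the seed type and not spelled out in this extract:]

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/minhash.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>

// One minhash value per (a, b) parameter pair for each input string;
// the device_uvectors convert to the device_span parameters.
std::unique_ptr<cudf::column> hash_strings(cudf::strings_column_view const& input,
                                           rmm::device_uvector<uint32_t> const& parameter_a,
                                           rmm::device_uvector<uint32_t> const& parameter_b)
{
  return nvtext::minhash(input, /*seed=*/0, parameter_a, parameter_b, /*width=*/4);
}
```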
+ * @copydoc nvtext::minhash64 * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash64() */ -[[deprecated]] std::unique_ptr word_minhash( - cudf::lists_column_view const& input, - cudf::device_span seeds, +[[deprecated]] std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm though - * only the first 64-bits of the hash are used in computing the output. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr word_minhash64( - cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 59c8453cf33..4715931a7a9 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 9d3cf75a13f..d45c02f374f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -18,8 +18,8 @@ #include "nvcomp_adapter.cuh" -#include #include +#include #include #include diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 6c84b53db46..7f0b5e07b09 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -28,13 +28,13 @@ #include "io/utilities/parsing_utils.cuh" #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7fafa885c66..7b9fc25d1cc 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -222,18 +222,19 @@ struct json_column_data { using hashmap_of_device_columns = std::unordered_map>; -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, cudf::detail::host_vector, hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); void scatter_offsets(tree_meta_t const& tree, device_span col_ids, @@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream); @@ -363,17 +365,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); if (ignore_vals.empty()) return; scatter_offsets(tree, col_ids, @@ -382,22 +384,24 @@ void make_device_json_column(device_span input, sorted_col_ids, d_column_tree, ignore_vals, + is_mixed_pruned, columns, stream); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, cudf::detail::host_vector, 
hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); @@ -488,7 +492,9 @@ std::pair, hashmap_of_device_columns> build_tree // NoPruning: iterate through schema and enforce type. if (adj[parent_node_sentinel].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; // for empty file + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; // for empty file CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); @@ -551,11 +557,14 @@ std::pair, hashmap_of_device_columns> build_tree auto list_child = schema.child_types.at(this_list_child_name); for (auto const& child_id : child_ids) mark_is_pruned(child_id, list_child); + // TODO: Store null map of non-target types for list children to mark list entry as null. } }; if (is_array_of_arrays) { if (adj[adj[parent_node_sentinel][0]].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; auto root_list_col_id = is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0]; // mark root and row array col_id as not pruned. @@ -647,8 +656,12 @@ std::pair, hashmap_of_device_columns> build_tree ? adj[parent_node_sentinel][0] : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + // List children which are pruned mixed types, nullify parent list row. + auto is_mixed_pruned = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(is_mixed_pruned.begin(), num_columns, false); auto handle_mixed_types = [&column_categories, &is_str_column_all_nulls, + &is_mixed_pruned, &is_pruned, &expected_types, &is_enabled_mixed_types_as_string, @@ -794,6 +807,14 @@ std::pair, hashmap_of_device_columns> build_tree "list child column insertion failed, duplicate column name in the parent"); ref.get().column_order.emplace_back(list_child_name); auto this_ref = std::ref(ref.get().child_columns.at(list_child_name)); + if (options.is_enabled_experimental()) { + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) { + // store this child_id for mixed_type nullify parent list_id. + is_mixed_pruned[child_id] = is_pruned[child_id]; + } + } + } // Mixed type handling handle_mixed_types(child_ids); if (child_ids.empty()) { @@ -829,7 +850,7 @@ std::pair, hashmap_of_device_columns> build_tree [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); - return {is_pruned, columns}; + return {is_pruned, is_mixed_pruned, columns}; } void scatter_offsets(tree_meta_t const& tree, @@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed_pruned, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream) { @@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree, auto d_ignore_vals = cudf::detail::make_device_uvector_async( ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async( + is_mixed_pruned, stream, cudf::get_current_device_resource_ref()); auto d_columns_data = cudf::detail::make_device_uvector_async( columns_data, stream, cudf::get_current_device_resource_ref()); @@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree, column_categories[col_ids[parent_node_id]] == NC_LIST and (!d_ignore_vals[col_ids[parent_node_id]]); }); + // For children of list and in ignore_vals, find it's parent node id, and set corresponding + // parent's null mask to null. Setting mixed type list rows to null. + auto const num_list_children = thrust::distance( + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + d_is_mixed_pruned = d_is_mixed_pruned.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return; + if (column_categories[col_ids[parent_node_id]] == NC_LIST and + d_is_mixed_pruned[col_ids[node_id]]) { + clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]); + } + }); - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 30a154fdda2..1fe58a0449f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -464,46 +464,49 @@ std::pair, std::vector> device_json_co column_names.emplace_back( json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); - // Note: json_col modified here, reuse the memory + // If child is not present, set the null mask correctly, but offsets are zero, and children + // are empty. Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release(), rmm::device_buffer{}, 0); // Create children column - auto child_schema_element = - json_col.child_columns.empty() ? 
std::optional{} : get_list_child_schema(); - auto [child_column, names] = - json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) - ? std::pair, - // EMPTY type could not used because gather throws exception on EMPTY type. - std::vector>{std::make_unique( - data_type{type_id::INT8}, - 0, - rmm::device_buffer{}, - rmm::device_buffer{}, - 0), - std::vector{}} - : device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - prune_columns, - child_schema_element, - stream, - mr); + auto child_schema_element = get_list_child_schema(); + auto [child_column, names] = [&]() { + if (json_col.child_columns.empty()) { + // EMPTY type could not used because gather throws exception on EMPTY type. + auto empty_col = make_empty_column( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr); + auto children_metadata = std::vector{ + make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), + list_child_name) + .children}; + + return std::pair, std::vector>{ + std::move(empty_col), children_metadata}; + } + return device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + prune_columns, + child_schema_element, + stream, + mr); + }(); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); - auto ret_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - 0, - rmm::device_buffer{0, stream, mr}, - stream, - mr); - // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and - // null validation applied in make_lists_column factory, which is not needed for json - // parent column cannot be null when its children is non-empty in JSON - if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); } + auto ret_col = make_lists_column( + num_rows, + std::move(offsets_column), + std::move(child_column), + null_count, + null_count == 0 ? 
rmm::device_buffer{0, stream, mr} : std::move(result_bitmask), + stream, + mr); + // Since some rows in child column may need to be nullified due to mixed types, we can not + // skip the purge_nonempty_nulls call in make_lists_column factory return {std::move(ret_col), std::move(column_names)}; } default: CUDF_FAIL("Unsupported column type"); break; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 4989fff4b30..cc5f256ea80 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create empty column of a given nested schema + * + * @param schema The schema of the column to create + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The empty column + */ +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @brief Create all null column of a given nested schema * @@ -452,17 +464,6 @@ std::unique_ptr make_all_nulls_column(schema_element const& schema, */ column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the column - * @param options json reader options which holds schema - * @return data type of the column if present - */ -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options); - /** * @brief Helper class to get path of a column by column id from reduced column tree * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index ced7acb9cde..4b4827ca8d9 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,78 +68,6 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { -namespace { - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? 
get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? user_dtypes.child_types.find(col_name)->second - : std::optional{}; - }}, - options.get_dtypes()); -} - -} // namespace - /// Created an empty column of the specified schema struct empty_column_functor { rmm::cuda_stream_view stream; @@ -159,7 +87,17 @@ struct empty_column_functor { std::unique_ptr child = cudf::type_dispatcher( schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); auto offsets = make_empty_column(data_type(type_to_id())); - return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. 
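A note on why purge_nonempty_nulls matters at all (and why the json_column.cu list path above now has to keep it while this path can skip it): a "non-empty null" is a null list row whose offsets still span child elements. An illustration with hypothetical values:

  offsets: {0, 2, 4}   validity: {1, 0}
    -> row 1 is null but its offsets still claim child rows 2..4 (a non-empty null)
  after purge: offsets {0, 2, 2}, and the orphaned child rows are dropped

A size-0 column can never be in this state, so the direct construction here safely bypasses the factory's purge pass.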
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(child_columns)); } template )> @@ -174,6 +112,13 @@ struct empty_column_functor { } }; +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, empty_column_functor{stream, mr}, schema); +} + /// Created all null column of the specified schema struct allnull_column_functor { rmm::cuda_stream_view stream; @@ -198,10 +143,9 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); auto indices = make_zeroed_offsets(size - 1); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); @@ -221,14 +165,22 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); - auto offsets = make_zeroed_offsets(size); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); + auto offsets = make_zeroed_offsets(size); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_lists_column( - size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } template )> @@ -240,8 +192,14 @@ struct allnull_column_functor { schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); } auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_structs_column( - size, std::move(child_columns), size, std::move(null_mask), stream, mr); + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` on + // the children columns. 
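The same factory-bypass is applied to structs below: cudf::make_structs_column would superimpose the parent's nulls onto the children, which is redundant for a freshly built all-null column whose children are already consistent. A minimal sketch of the direct construction, with an illustrative function name:

#include <cudf/column/column.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>
#include <utility>
#include <vector>

// Build an all-null STRUCT column from already-consistent children without
// cudf::make_structs_column, skipping its superimpose_nulls pass (see #17356).
std::unique_ptr<cudf::column> make_all_null_struct(
  cudf::size_type size,
  std::vector<std::unique_ptr<cudf::column>>&& children,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
{
  auto null_mask = cudf::detail::create_null_mask(size, cudf::mask_state::ALL_NULL, stream, mr);
  return std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::STRUCT},
                                        size,
                                        rmm::device_buffer{},  // struct parents hold no data
                                        std::move(null_mask),
                                        size,  // all rows are null
                                        std::move(children));
}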
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::STRUCT}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } }; @@ -281,48 +239,4 @@ column_name_info make_column_name_info(schema_element const& schema, std::string } return info; } - -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options) -{ - if (path.empty()) return {}; - std::optional col_schema = child_schema_element(path.back().first, options); - // check if it has value, then do recursive call and return. - if (col_schema.has_value()) { - return get_path_data_type(path, col_schema.value()); - } else { - return {}; - } -} - -// idea: write a memoizer using template and lambda?, then call recursively. -std::vector path_from_tree::get_path(NodeIndexT this_col_id) -{ - std::vector path; - // stops at root. - while (this_col_id != parent_node_sentinel) { - auto type = column_categories[this_col_id]; - std::string name = ""; - // code same as name_and_parent_index lambda. - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } - // "name": type/schema - path.emplace_back(name, type); - this_col_id = parent_col_id; - if (this_col_id == row_array_parent_col_id) return path; - } - return {}; -} - } // namespace cudf::io::json::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index fcaee9c548e..726c79bd004 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ed0b6969154..07172b6b7f7 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,10 +23,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d432deb8e79..8e532b01788 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -28,10 +28,11 @@ #include #include +#include #include -#include #include #include +#include #include #include #include @@ -506,7 +507,7 @@ size_t max_varint_size() return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7); } -constexpr size_t RLE_stream_size(TypeKind kind, size_t count) +size_t RLE_stream_size(TypeKind kind, size_t count) { using cudf::util::div_rounding_up_unsafe; constexpr auto byte_rle_max_len = 128; @@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, // we know the size of each array. The number of stripes per column in a chunk array can // be calculated by dividing the number of chunks by the number of columns. // That many chunks need to be copied at a time to the proper destination. 
- size_t num_entries_seen = 0; + size_t num_entries_seen = 0; + auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2; + auto h_srcs = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_dsts = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_lens = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; - auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); - auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); for (size_t col = 0; col < num_columns; ++col) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, - chunk_bytes, - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY( - cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, - merge_bytes, - cudaMemcpyDefault, - stream.value())); + h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col); + h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_chunk)); + + h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col); + h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group)); } num_entries_seen += stripes_per_col; } + auto const& mr = cudf::get_current_device_resource_ref(); + auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr); + auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr); + auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr); + cudf::detail::batched_memcpy_async( + d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream); + auto file_stats_merge = cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index bfd0cc992cf..0dd1aff41e9 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,7 +23,7 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" -#include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f865c9a7643..188e6a8c0d8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -38,10 +38,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 2a2a07afc8d..00fc54f9883 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -60,7 +60,7 @@ #include "base64_utilities.hpp" -#include +#include #include diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index bed03869b34..dfa5d46cf48 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -16,9 +16,9 @@ #include "file_io_utilities.hpp" 
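The rewritten loop above replaces two cudaMemcpyAsync calls per column per chunk with a single batched copy. The shape of the pattern, as a hedged sketch: the element types of the staging vectors and the header path are plausible assumptions, since the extracted diff elides them.

#include <cudf/detail/utilities/batched_memcpy.hpp>  // assumed location of batched_memcpy_async
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>

// Stage (src, dst, len) triples on the host, upload them once, then issue a
// single batched copy instead of one cudaMemcpyAsync per buffer.
void copy_in_one_batch(std::size_t num_copies,
                       rmm::cuda_stream_view stream,
                       rmm::device_async_resource_ref mr)
{
  auto h_srcs = cudf::detail::make_empty_host_vector<void const*>(num_copies, stream);
  auto h_dsts = cudf::detail::make_empty_host_vector<void*>(num_copies, stream);
  auto h_lens = cudf::detail::make_empty_host_vector<std::size_t>(num_copies, stream);
  // ... h_srcs.push_back(src); h_dsts.push_back(dst); h_lens.push_back(bytes); ...
  auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
  auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
  auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr);
  cudf::detail::batched_memcpy_async(
    d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream);
}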
-#include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62ef7c7a794..38dedcc2627 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,11 +17,11 @@ #include "file_io_utilities.hpp" #include "getenv_or.hpp" -#include #include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9b17e7f6d55..28367c95430 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -19,10 +19,11 @@ #include "getenv_or.hpp" #include -#include #include +#include #include +#include #include #include diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index 3fd97a00b61..b9613428418 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh index 687766c1bcc..ff1b9f58e6c 100644 --- a/cpp/src/io/utilities/time_utils.cuh +++ b/cpp/src/io/utilities/time_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ static const __device__ __constant__ int32_t powers_of_ten[10] = { struct get_period { template - constexpr int32_t operator()() + int32_t operator()() { if constexpr (is_chrono()) { return T::period::den; } CUDF_FAIL("Invalid, non chrono type"); @@ -42,7 +42,7 @@ struct get_period { /** * @brief Function that translates cuDF time unit to clock frequency */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) +inline int32_t to_clockrate(type_id timestamp_type_id) { return timestamp_type_id == type_id::EMPTY ? 
0 diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index ebab3beb08f..d6b85db3f0f 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, auto const stride = cudf::detail::grid_1d::grid_stride(); // Initialize local histogram - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_sizes[partition_number] = 0; partition_number += blockDim.x; @@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit extern __shared__ size_type shared_partition_offsets[]; // Initialize array of this blocks offsets from global array - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_offsets[partition_number] = block_partition_offsets[partition_number * gridDim.x + blockIdx.x]; @@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter, // Fetch the offset in the output buffer of each partition in this thread // block - for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) { + for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions; + ipartition += blockDim.x) { partition_offset_global[ipartition] = scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x]; } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d27420658d6..2128bacff80 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, size_type const* group_cluster_offsets, bool has_nulls) { - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const group_index = tid; if (group_index >= num_groups) { return; } diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 7d11b02d3e1..9ab8ed5938a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -95,8 +95,8 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const helper_func = [&](auto const& d_equal) { - using RowHasher = std::decay_t; - auto set = hash_set_type{ + using RowEqual = std::decay_t; + auto set = distinct_set_t{ num_rows, 0.5, // desired load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3a004b7f28..aadb438b019 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -21,8 +21,8 @@ namespace cudf::detail { -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, @@ -100,7 +100,7 @@ rmm::device_uvector reduce_by_row(hash_set_type& set, } template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -110,7 +110,7 @@ template rmm::device_uvector reduce_by_row( 
rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -120,7 +120,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -130,7 +130,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index f15807c2434..4ca1cab937a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } -template -using hash_set_type = +template +using distinct_set_t = cuco::static_set, cuda::thread_scope_device, - RowHasher, + RowEqual, cuco::linear_probing<1, cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, @@ -79,6 +79,8 @@ using hash_set_type = * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. * + * @tparam RowEqual The type of row equality comparator + * * @param set The auxiliary set to perform reduction * @param set_size The number of elements in set * @param num_rows The number of all input rows @@ -87,8 +89,8 @@ using hash_set_type = * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the output indices */ -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 0f33fcb6fe1..94bc81ec933 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -347,13 +348,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = idx % cudf::detail::warp_size; + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = warp.thread_rank(); + if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); @@ -373,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, } } - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + auto const result = warp.any(found); if (lane_idx == 0) { d_results[str_idx] = result; } } diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index b7a719a2041..9a44d9477ab 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -52,118 +52,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Compute the minhash of each string for each seed - * - * This is a warp-per-string algorithm where parallel threads within a warp - * work on 
substrings of a single string row. - * - * @tparam HashFunction hash function to use on each substring - * - * @param d_strings Strings column to process - * @param seeds Seeds for hashing each string - * @param width Substring window size in characters - * @param d_hashes Minhash output values for each string - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash substring and store the min value - if constexpr (std::is_same_v) { - auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. 
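For reference, the per-seed reduction this removed kernel performed is a relaxed atomic min: each lane folds its hash into the seed's output slot. A distilled sketch; the block scope mirrors what the deleted code appears to use, though the extracted diff strips the template arguments:

#include <cuda/atomic>

#include <cstdint>

// Each lane folds one hash value into the per-seed output with a relaxed,
// block-scoped atomic min; the winning (minimum) hash is the minhash.
__device__ void fold_min(std::uint32_t* d_output, std::uint32_t hash_value)
{
  cuda::atomic_ref<std::uint32_t, cuda::thread_scope_block> ref{*d_output};
  ref.fetch_min(hash_value, cuda::std::memory_order_relaxed);
}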
- auto const hvalue = thrust::get<0>(hasher(hash_str)); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr minhash_fn(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS(width >= 2, - "Parameter width should be an integer value of 2 or greater", - std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_kernel<<>>( - *d_strings, seeds, width, d_hashes); - - return hashes; -} - constexpr cudf::thread_index_type block_size = 256; // for potentially tuning minhash_seed_kernel independently from block_size constexpr cudf::thread_index_type tile_size = block_size; @@ -297,13 +185,13 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, * @param d_results Final results vector of calculate values */ template -CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - cudf::device_span indices, - cudf::device_span parameter_a, - cudf::device_span parameter_b, - cudf::size_type width, - hash_value_type const* d_hashes, - hash_value_type* d_results) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const idx = (tid / blocks_per_string) / block_size; @@ -478,7 +366,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -489,7 +377,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data() + threshold_index, count); cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -497,101 +385,6 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return results; } -/** - * @brief Compute the minhash of each list row of strings for each seed - * - * This is a warp-per-row algorithm 
where parallel threads within a warp - * work on strings in a single list row. - * - * @tparam HashFunction hash function to use on each string - * - * @param d_input List of strings to process - * @param seeds Seeds for hashing each string - * @param d_hashes Minhash output values (one per row) - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, - cudf::device_span seeds, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const row_idx = idx / cudf::detail::warp_size; - - if (row_idx >= d_input.size()) { return; } - if (d_input.is_null(row_idx)) { return; } - - auto const d_row = cudf::list_device_view(d_input, row_idx); - auto const d_output = d_hashes + (row_idx * seeds.size()); - - // initialize hashes output for this row - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - if (lane_idx == 0) { - auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - // each lane hashes a string from the input row - for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { - auto const hash_str = - d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash string and store the min value - hash_value_type hv; - if constexpr (std::is_same_v) { - hv = hasher(hash_str); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. 
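Both removed warp-per-row/warp-per-string kernels share the same init-then-reduce shape: lane 0 seeds the outputs with the max sentinel, the warp synchronizes, then every lane reduces into them with fetch_min. A self-contained sketch of that prologue, with illustrative names:

#include <cstdint>

// Lane 0 initializes the per-seed outputs (max sentinel, or 0 for an empty
// row), then the warp synchronizes so the init is visible before any lane
// starts folding hashes in with atomic min.
__device__ void init_outputs(std::uint32_t* d_output, int num_seeds, int lane_idx, bool empty_row)
{
  if (lane_idx == 0) {
    auto const init = empty_row ? 0u : 0xFFFFFFFFu;  // numeric_limits<uint32_t>::max()
    for (int s = 0; s < num_seeds; ++s) { d_output[s] = init; }
  }
  __syncwarp();
}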
- hv = thrust::get<0>(hasher(hash_str)); - } - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hv, cuda::std::memory_order_relaxed); - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_input = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - auto lcdv = cudf::detail::lists_column_device_view(*d_input); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_word_kernel - <<>>(lcdv, seeds, d_hashes); - - return hashes; -} - std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, @@ -620,30 +413,6 @@ std::unique_ptr build_list_result(cudf::column_view const& input, } } // namespace -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash(cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -658,30 +427,6 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -696,45 +441,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash(input, seed, width, stream, mr); -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, stream, mr); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, @@ -750,23 +468,15 @@ std::unique_ptr minhash_permuted(cudf::strings_column_view const& } std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash64(input, seed, width, stream, mr); -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, seeds, width, stream, mr); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, @@ -781,21 +491,4 @@ std::unique_ptr minhash64_permuted(cudf::strings_column_view const return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash(input, seeds, stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, 
- cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash64(input, seeds, stream, mr); -} } // namespace nvtext diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 4fd0369c26b..9d96c11c3f2 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header - thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto const block_size = static_cast(blockDim.x); + thread_index_type const start = threadIdx.x + blockIdx.x * block_size; + thread_index_type const stride = block_size * gridDim.x; for (auto i = start; i < static_cast(size); i += stride) { GENERIC_UNARY_OP(&out_data[i], in_data[i]); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 66bbe532e46..39c11295fbd 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = static_cast(cudf::detail::grid_1d::global_thread_id()); auto const num_segments = static_cast(output.size()); if (tid >= num_segments) { return; } diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index e30806a5011..4196523d211 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include #include diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp deleted file mode 100644 index e52fffbd8c6..00000000000 --- a/cpp/src/utilities/logger.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace { - -/** - * @brief Creates a sink for libcudf logging. - * - * Returns a file sink if the file name has been specified, otherwise returns a stderr sink. - */ -[[nodiscard]] spdlog::sink_ptr make_libcudf_sink() -{ - if (auto filename = std::getenv("LIBCUDF_DEBUG_LOG_FILE"); filename != nullptr) { - return std::make_shared(filename, true); - } else { - return std::make_shared(); - } -} - -/** - * @brief Converts the level name into the `spdlog` level enum. 
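In the JIT transform kernel change above, blockDim.x is widened before the multiply so the start/stride computation runs in 64-bit end to end, instead of overflowing 32-bit intermediates on very large grids. A self-contained sketch of the pattern; the increment stands in for GENERIC_UNARY_OP:

#include <cstdint>

using thread_index_type = std::int64_t;  // matches cudf::thread_index_type

__global__ void unary_transform(int* out, int const* in, int size)
{
  // Widen blockDim.x first so every term of the index math is 64-bit.
  auto const block_size = static_cast<thread_index_type>(blockDim.x);
  thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
  thread_index_type const stride = block_size * gridDim.x;
  for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) {
    out[i] = in[i] + 1;  // stand-in for GENERIC_UNARY_OP
  }
}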
- */ -[[nodiscard]] spdlog::level::level_enum libcudf_log_level() -{ - auto const env_level = std::getenv("LIBCUDF_LOGGING_LEVEL"); - if (env_level == nullptr) { return spdlog::level::warn; } - - auto const env_lvl_str = std::string(env_level); - if (env_lvl_str == "TRACE") return spdlog::level::trace; - if (env_lvl_str == "DEBUG") return spdlog::level::debug; - if (env_lvl_str == "INFO") return spdlog::level::info; - if (env_lvl_str == "WARN") return spdlog::level::warn; - if (env_lvl_str == "ERROR") return spdlog::level::err; - if (env_lvl_str == "CRITICAL") return spdlog::level::critical; - if (env_lvl_str == "OFF") return spdlog::level::off; - - CUDF_FAIL("Invalid value for LIBCUDF_LOGGING_LEVEL environment variable"); -} - -/** - * @brief Simple wrapper around a spdlog::logger that performs cuDF-specific initialization. - */ -struct logger_wrapper { - spdlog::logger logger_; - - logger_wrapper() : logger_{"CUDF", make_libcudf_sink()} - { - logger_.set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); - logger_.set_level(libcudf_log_level()); - logger_.flush_on(spdlog::level::warn); - } -}; - -} // namespace - -spdlog::logger& cudf::detail::logger() -{ - static logger_wrapper wrapped{}; - return wrapped.logger_; -} - -spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d1bebd1937..b0f2d8c0637 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include diff --git a/cpp/tests/bitmask/set_nullmask_tests.cu b/cpp/tests/bitmask/set_nullmask_tests.cu index e95c9fb41c6..9f8d22ea94d 100644 --- a/cpp/tests/bitmask/set_nullmask_tests.cu +++ b/cpp/tests/bitmask/set_nullmask_tests.cu @@ -31,6 +31,7 @@ #include #include +namespace { struct valid_bit_functor { cudf::bitmask_type const* _null_mask; __device__ bool operator()(cudf::size_type element_index) const noexcept @@ -38,13 +39,7 @@ struct valid_bit_functor { return cudf::bit_is_set(_null_mask, element_index); } }; - -std::ostream& operator<<(std::ostream& stream, thrust::host_vector const& bits) -{ - for (auto _bit : bits) - stream << int(_bit); - return stream; -} +} // namespace struct SetBitmaskTest : public cudf::test::BaseFixture { void expect_bitmask_equal(cudf::bitmask_type const* bitmask, // Device Ptr diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index 96f122f21a8..8ffcc552ecb 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -28,6 +28,7 @@ struct ValidIfTest : public cudf::test::BaseFixture {}; +namespace { struct odds_valid { __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; } }; @@ -37,6 +38,7 @@ struct all_valid { struct all_null { __host__ __device__ bool operator()(cudf::size_type i) { return false; } }; +} // namespace TEST_F(ValidIfTest, EmptyRange) { diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index 5570a7d498c..1f29ea9e5fc 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -25,6 +25,7 @@ #include +namespace { template struct rep_type_impl { using type = void; @@ -47,12 +48,14 @@ struct rep_type_impl()>> { template using rep_type_t = typename rep_type_impl::type; +} // namespace template struct ColumnViewAllTypesTests : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ColumnViewAllTypesTests, 
cudf::test::FixedWidthTypes); +namespace { template void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end) { @@ -102,6 +105,7 @@ void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator } } } +} // namespace TYPED_TEST(ColumnViewAllTypesTests, BitCast) { diff --git a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu index d7e93fb22a3..fff3282fdd5 100644 --- a/cpp/tests/column/compound_test.cu +++ b/cpp/tests/column/compound_test.cu @@ -34,6 +34,7 @@ struct CompoundColumnTest : public cudf::test::BaseFixture {}; +namespace { template struct checker_for_level1 { ColumnDeviceView d_column; @@ -62,6 +63,7 @@ struct checker_for_level2 { return bcheck; } }; +} // namespace TEST_F(CompoundColumnTest, ChildrenLevel1) { diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index b81f8196d89..2fb24f6b31e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,6 +31,7 @@ #include +namespace { template CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { @@ -109,6 +110,7 @@ std::enable_if_t(), T> accumulate(cudf::host_span xs.begin(), xs.end(), ys.begin(), [](T const& ts) { return ts.time_since_epoch().count(); }); return T{typename T::duration{std::accumulate(ys.begin(), ys.end(), 0)}}; } +} // namespace template struct AtomicsTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index b96c6909e55..f8f8d525043 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -577,10 +577,12 @@ TEST_F(FixedPointTest, Decimal32FloatVector) float_vector_test(0.15, 20, -2, std::multiplies<>()); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index f34760341d8..ddc48c97012 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -72,10 +72,12 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust) EXPECT_EQ(vec2, vec3); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TEST_F(FixedPointTest, DecimalXXThrustOnDevice) { diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 4ae5d06b214..883a5093bd1 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -30,6 +30,7 @@ #include #include +namespace { /** * @brief Functor to generate a tdigest by key. 
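The recurring test change in this stretch of the diff wraps file-local helpers in anonymous namespaces. That gives them internal linkage, so two test translation units can define identically named functors (or stray operator<< overloads) without ODR violations when linked into one test binary. Minimal sketch:

namespace {
// Internal linkage: visible only in this translation unit, so a helper with
// the same name in another test file cannot collide with it at link time.
struct odds_valid {
  bool operator()(int i) const { return i % 2 != 0; }
};
}  // namespace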
* @@ -116,6 +117,7 @@ struct tdigest_groupby_simple_merge_op { return std::move(result.second[0].results[0]); } }; +} // namespace template struct TDigestAllTypes : public cudf::test::BaseFixture {}; @@ -508,6 +510,7 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } +namespace { std::unique_ptr do_agg( cudf::column_view key, cudf::column_view val, @@ -537,6 +540,7 @@ std::unique_ptr do_agg( return std::make_unique(std::move(result_columns)); } +} // namespace TEST_F(TDigestMergeTest, AllValuesAreNull) { diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index ef4b9dd9b8a..b7106e823dd 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -26,6 +26,7 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; @@ -60,6 +61,7 @@ void validate_dtype(DLDataType const& dtype) EXPECT_EQ(1, dtype.lanes); EXPECT_EQ(sizeof(T) * 8, dtype.bits); } +} // namespace class DLPackUntypedTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 3c8db99c3c7..37a750330fa 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -56,6 +56,8 @@ using int16_wrapper = wrapper; using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using size_type_wrapper = wrapper; +using strings_wrapper = cudf::test::strings_column_wrapper; using cudf::data_type; using cudf::type_id; @@ -3253,6 +3255,144 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); } } + + // test list (all-null) of struct (empty) of string (empty) + { + std::string json_stringl = R"( + {"a" : [1], "c2": [1, 2]} + {} + )"; + auto lines = true; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_stringl.data(), json_stringl.size()}) + .prune_columns(true) + .experimental(true) + .lines(lines); + + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"a", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"c2", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"d", {data_type{cudf::type_id::STRING}}}, + }, + {{"d"}}}}}}}, + }, + {{"a", "c2"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a":[int64_t] + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element"); + // Make sure we have all null list "c2": [{"d": ""}] + EXPECT_EQ(result.metadata.schema_info[1].name, "c2"); + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "element"); + ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "d"); + + auto const expected0 = [&] { + auto const valids = std::vector{1, 0}; + auto 
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 1, 1}.release(),
+                                     int64_wrapper{1}.release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    auto const expected1 = [&] {
+      auto const get_structs = [] {
+        auto child = cudf::test::strings_column_wrapper{};
+        return cudf::test::structs_column_wrapper{{child}};
+      };
+      auto const valids = std::vector<bool>{0, 0};
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 0, 0}.release(),
+                                     get_structs().release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected0, result.tbl->get_column(0).view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected1, result.tbl->get_column(1).view());
+  }
+}
+
+TEST_F(JsonReaderTest, NullifyMixedList)
+{
+  using namespace cudf::test::iterators;
+  // test list
+  std::string json_stringl = R"(
+    {"c2": []}
+    {"c2": [{}]}
+    {"c2": [[]]}
+    {"c2": [{}, [], {}]}
+    {"c2": [[123], {"b": "1"}]}
+    {"c2": [{"x": "y"}, {"b": "1"}]}
+    {}
+  )";
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  // valid  1  1  0  0  0  1  0
+  // offset 0, 0, 1, 1, 1, 1, 3, 3
+  // child  {null, null}, {null, null}, {1, null}
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_stringl.data(), json_stringl.size()})
+      .prune_columns(true)
+      .experimental(true)
+      .lines(true);
+
+  // struct<list<struct<string, string>>> eg. {"c2": [{"b": "1", "c": "2"}]}
+  cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                        {
+                                          {"c2",
+                                           {data_type{cudf::type_id::LIST},
+                                            {{"element",
+                                              {data_type{cudf::type_id::STRUCT},
+                                               {
+                                                 {"b", {data_type{cudf::type_id::STRING}}},
+                                                 {"c", {data_type{cudf::type_id::STRING}}},
+                                               },
+                                               {{"b", "c"}}}}}}},
+                                        },
+                                        {{"c2"}}};
+  in_options.set_dtypes(dtype_schema);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  ASSERT_EQ(result.tbl->num_columns(), 1);
+  ASSERT_EQ(result.metadata.schema_info.size(), 1);
+
+  // Expected: A list of struct of 2-string columns
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  auto get_structs = [] {
+    strings_wrapper child0{{"", "", "1"}, nulls_at({0, 0, 1})};
+    strings_wrapper child1{{"", "", ""}, all_nulls()};
+    // purge non-empty nulls in list seems to retain nullmask in struct child column
+    return cudf::test::structs_column_wrapper{{child0, child1}, no_nulls()}.release();
+  };
+  std::vector<bool> const list_nulls{1, 1, 0, 0, 0, 1, 0};
+  auto [null_mask, null_count] =
+    cudf::test::detail::make_null_mask(list_nulls.cbegin(), list_nulls.cend());
+  auto const expected = cudf::make_lists_column(
+    7,
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 1, 1, 1, 1, 3, 3}.release(),
+    get_structs(),
+    null_count,
+    std::move(null_mask));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, result.tbl->get_column(0).view());
 }
 
 struct JsonCompressedIOTest : public cudf::test::BaseFixture,
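Both new tests above lean on the same reader setup; as a hedged sketch (buffer contents and names are illustrative, not lifted from the patch), pruned, schema-driven JSON-lines reading looks like:

```cpp
// Sketch: schema-driven JSONL read; fields absent from the schema are pruned.
auto const buf = std::string{R"({"a": 1, "ignored": true})"};
auto opts      = cudf::io::json_reader_options::builder(
                cudf::io::source_info{buf.data(), buf.size()})
                .lines(true)           // one JSON record per line
                .prune_columns(true)   // keep only columns named in the schema
                .build();
cudf::io::schema_element schema{cudf::data_type{cudf::type_id::STRUCT},
                                {{"a", {cudf::data_type{cudf::type_id::INT64}}}},
                                {{"a"}}};  // explicit column order
opts.set_dtypes(schema);
auto result = cudf::io::read_json(opts);  // table with a single INT64 column "a"
```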
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 887d4fa783f..5201a46ba7d 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -34,6 +34,8 @@
 namespace cuio_json = cudf::io::json;
 
+namespace {
+
 // Host copy of tree_meta_t
 struct tree_meta_t2 {
   std::vector<cuio_json::NodeT> node_categories;
@@ -43,8 +45,6 @@ struct tree_meta_t2 {
   std::vector<cuio_json::SymbolOffsetT> node_range_end;
 };
 
-namespace {
-
 tree_meta_t2 to_cpu_tree(cuio_json::tree_meta_t const& d_value, rmm::cuda_stream_view stream)
 {
   return {cudf::detail::make_std_vector_async(d_value.node_categories, stream),
diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu
index f988ae24b38..a67830a7864 100644
--- a/cpp/tests/io/json/json_tree_csr.cu
+++ b/cpp/tests/io/json/json_tree_csr.cu
@@ -36,6 +36,8 @@
 namespace cuio_json = cudf::io::json;
 
+namespace {
+
 struct h_tree_meta_t {
   std::vector<cuio_json::NodeT> node_categories;
   std::vector<cuio_json::NodeIndexT> parent_node_ids;
@@ -222,6 +224,7 @@ void run_test(std::string const& input, bool enable_lines = true)
   // assert equality between csr and meta formats
   ASSERT_TRUE(iseq);
 }
+}  // namespace
 
 struct JsonColumnTreeTests : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index 153a8a0c5aa..369376b6c95 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1074,6 +1074,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount)
   } while (reader.has_next());
 }
 
+namespace {
 constexpr size_t input_limit_expected_file_count = 4;
 
 std::vector<std::string> input_limit_get_test_names(std::string const& base_filename)
@@ -1133,6 +1134,7 @@ void input_limit_test_read(std::vector<std::string> const& test_filenames,
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t);
   }
 }
+}  // namespace
 
 struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {};
 
@@ -1189,6 +1191,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns)
 
 struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
 
+namespace {
 struct offset_gen {
   int const group_size;
   __device__ int operator()(int i) { return i * group_size; }
@@ -1198,6 +1201,8 @@ template <typename T>
 struct value_gen {
   __device__ T operator()(int i) { return i % 1024; }
 };
+}  // namespace
+
 TEST_F(ParquetChunkedReaderInputLimitTest, List)
 {
   auto base_path = temp_env->get_temp_filepath("list");
@@ -1263,6 +1268,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List)
   input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c);
 }
 
+namespace {
 void tiny_list_rowgroup_test(bool just_list_col)
 {
   auto iter = thrust::make_counting_iterator(0);
@@ -1320,6 +1326,7 @@ void tiny_list_rowgroup_test(bool just_list_col)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *(result.first));
 }
+}  // namespace
 
 TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsSingle)
 {
@@ -1333,6 +1340,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsMixed)
   tiny_list_rowgroup_test(false);
 }
 
+namespace {
 struct char_values {
   __device__ int8_t operator()(int i)
   {
@@ -1341,6 +1349,8 @@ struct char_values {
     return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
   }
 };
+}  // namespace
+
 TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
 {
   auto base_path = temp_env->get_temp_filepath("mixed_types");
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index 257c0979017..8377060b6ec 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -26,16 +26,6 @@
 
 using TestingTypes = cudf::test::NumericTypes;
 
-namespace cudf {
-// To print meanvar for debug.
-// Needs to be in the cudf namespace for ADL
-template <typename T>
-std::ostream& operator<<(std::ostream& os, cudf::meanvar<T> const& rhs)
-{
-  return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] ";
-};
-}  // namespace cudf
-
 template <typename T>
 struct NumericOptionalIteratorTest : public IteratorTest<T> {};
 
@@ -46,6 +36,7 @@ TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator)
 }
 TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_iterator(*this); }
 
+namespace {
 // Transformers and Operators for optional_iterator test
 template <typename ElementType>
 struct transformer_optional_meanvar {
@@ -65,6 +56,7 @@ template <typename T>
 struct optional_to_meanvar {
   CUDF_HOST_DEVICE inline T operator()(cuda::std::optional<T> const& v) { return v.value_or(T{0}); }
 };
+}  // namespace
 
 // TODO: enable this test also at __CUDACC_DEBUG__
 // This test causes fatal compilation error only at device debug mode.
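The printers deleted above existed only so googletest could format cudf::meanvar on failure; they had to live in namespace cudf so argument-dependent lookup would find them. The pattern, sketched with hypothetical names:

```cpp
#include <ostream>

namespace myns {
struct mytype {
  int value;
};

// ADL: because mytype lives in myns, this overload is found even from
// template code (e.g. googletest's printers); a global overload is not.
inline std::ostream& operator<<(std::ostream& os, mytype const& rhs)
{
  return os << "[" << rhs.value << "]";
}
}  // namespace myns
```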
diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu
index 3447aa0dde6..5f707232953 100644
--- a/cpp/tests/iterator/pair_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu
@@ -24,16 +24,6 @@
 
 using TestingTypes = cudf::test::NumericTypes;
 
-namespace cudf {
-// To print meanvar for debug.
-// Needs to be in the cudf namespace for ADL
-template <typename T>
-std::ostream& operator<<(std::ostream& os, cudf::meanvar<T> const& rhs)
-{
-  return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] ";
-};
-}  // namespace cudf
-
 template <typename T>
 struct NumericPairIteratorTest : public IteratorTest<T> {};
 
@@ -53,6 +43,7 @@ struct transformer_pair_meanvar {
   };
 };
 
+namespace {
 struct sum_if_not_null {
   template <typename T>
   CUDF_HOST_DEVICE inline thrust::pair<T, bool> operator()(thrust::pair<T, bool> const& lhs,
@@ -66,6 +57,7 @@ struct sum_if_not_null {
     return {rhs};
   }
 };
+}  // namespace
 
 // TODO: enable this test also at __CUDACC_DEBUG__
 // This test causes fatal compilation error only at device debug mode.
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 37414eb3fba..c146fd2ea4e 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -33,6 +33,7 @@
 
 #include
 
+namespace {
 std::unique_ptr<cudf::column> arrow_percentile_approx(cudf::column_view const& _values,
                                                       int delta,
                                                       std::vector<double> const& percentages)
@@ -315,6 +316,7 @@ cudf::data_type get_appropriate_type()
   if constexpr (cudf::is_fixed_point<T>()) { return cudf::data_type{cudf::type_to_id<T>(), -7}; }
   return cudf::data_type{cudf::type_to_id<T>()};
 }
+}  // namespace
 
 using PercentileApproxTypes =
  cudf::test::Concat<cudf::test::NumericTypes, cudf::test::FixedPointTypes>;
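For orientation, the helpers above exercise the tdigest/percentile_approx pipeline. A rough sketch under the public API (the delta of 1000 and the data values are made up):

```cpp
// Sketch: groupby-tdigest, then approximate percentiles from the digest.
void percentile_approx_example()
{
  auto keys = cudf::test::fixed_width_column_wrapper<int32_t>{0, 0, 0, 0};
  auto vals = cudf::test::fixed_width_column_wrapper<double>{1.0, 2.0, 3.0, 4.0};

  cudf::groupby::groupby gb(cudf::table_view{{keys}});
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = vals;
  requests[0].aggregations.push_back(
    cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000));  // delta = 1000
  auto result = gb.aggregate(requests);

  auto percentiles = cudf::test::fixed_width_column_wrapper<double>{0.25, 0.75};
  auto approx      = cudf::percentile_approx(
    cudf::tdigest::tdigest_column_view{result.second[0].results[0]->view()}, percentiles);
}
```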
diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu
index c8fec51e1c9..184725e17e0 100644
--- a/cpp/tests/reductions/tdigest_tests.cu
+++ b/cpp/tests/reductions/tdigest_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 template <typename T>
 struct ReductionTDigestAllTypes : public cudf::test::BaseFixture {};
 TYPED_TEST_SUITE(ReductionTDigestAllTypes, cudf::test::NumericTypes);
 
+namespace {
 struct reduce_op {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& values, int delta) const
   {
@@ -60,6 +61,7 @@ struct reduce_merge_op {
     return cudf::make_structs_column(tbl.num_rows(), std::move(cols), 0, rmm::device_buffer());
   }
 };
+}  // namespace
 
 TYPED_TEST(ReductionTDigestAllTypes, Simple)
 {
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
index 7133baf6df1..79ea6b7d6d4 100644
--- a/cpp/tests/streams/interop_test.cpp
+++ b/cpp/tests/streams/interop_test.cpp
@@ -23,9 +23,11 @@
 
 #include
 
+namespace {
 struct dlpack_deleter {
   void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); }
 };
+}  // namespace
 
 struct DLPackTest : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 042ac44621e..8bfb17e0efd 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -44,10 +44,9 @@ TEST_F(MinHashTest, Permuted)
 
   auto view = cudf::strings_column_view(input);
 
-  auto first  = thrust::counting_iterator<uint32_t>(10);
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto first   = thrust::counting_iterator<uint32_t>(10);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -66,9 +65,9 @@ TEST_F(MinHashTest, Permuted)
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 3);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 3);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -95,10 +94,9 @@ TEST_F(MinHashTest, PermutedWide)
   auto input = cudf::test::strings_column_wrapper({small, wide});
   auto view  = cudf::strings_column_view(input);
 
-  auto first  = thrust::counting_iterator<uint32_t>(20);
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto first   = thrust::counting_iterator<uint32_t>(20);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -109,9 +107,9 @@ TEST_F(MinHashTest, PermutedWide)
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 3);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 3);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -132,9 +130,8 @@ TEST_F(MinHashTest, PermutedManyParameters)
   auto first = thrust::counting_iterator<uint32_t>(20);
 
   // more than params_per_thread
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -152,9 +149,9 @@ TEST_F(MinHashTest, PermutedManyParameters)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   // more than params_per_thread
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 31);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t>(first, first + 31);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -182,15 +179,13 @@ TEST_F(MinHashTest, PermutedManyParameters)
 
 TEST_F(MinHashTest, EmptyTest)
 {
-  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  auto view    = cudf::strings_column_view(input->view());
-  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  auto view    = cudf::strings_column_view(input->view());
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
   EXPECT_EQ(results->size(), 0);
   auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>({1, 2, 3});
-  results       = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
   EXPECT_EQ(results->size(), 0);
 }
 
@@ -199,18 +194,16 @@ TEST_F(MinHashTest, ErrorsTest)
   auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"});
   auto view  = cudf::strings_column_view(input);
   auto empty = cudf::test::fixed_width_column_wrapper<uint32_t>();
-  EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
-    std::invalid_argument);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
+               std::invalid_argument);
   auto empty64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
   EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
+    nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
     std::invalid_argument);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
+               std::invalid_argument);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
-    std::invalid_argument);
-  EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
     std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
@@ -219,18 +212,16 @@ TEST_F(MinHashTest, ErrorsTest)
 
   auto const zeroes = thrust::constant_iterator<uint32_t>(0);
   auto params       = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+               std::overflow_error);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
     std::overflow_error);
-  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash64_permuted(
-                 view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
-               std::overflow_error);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
+               std::invalid_argument);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
-    std::invalid_argument);
-  EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
     std::invalid_argument);
 }
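All of the churn above is the rename of nvtext::minhash_permuted/minhash64_permuted to nvtext::minhash/minhash64; the signature (seed, two parameter columns, substring width) is unchanged. A short usage sketch with illustrative inputs:

```cpp
// Sketch: per-row MinHash values using the renamed API.
auto input = cudf::test::strings_column_wrapper({"minhash me"});
auto view  = cudf::strings_column_view(input);
auto a     = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
auto b     = cudf::test::fixed_width_column_wrapper<uint32_t>({4, 5, 6});
// seed = 0, substring width = 4; the result is one LIST<UINT32> row per
// input string, with one hash value per (a, b) parameter pair
auto lists = nvtext::minhash(view, 0, cudf::column_view(a), cudf::column_view(b), 4);
```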
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 01a042130d6..7e203086fca 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -590,6 +590,7 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists)
     cudf::test::fixed_width_column_wrapper<cudf::size_type>{32, 32, 32, 32});
 }
 
+namespace {
 struct sum_functor {
   cudf::size_type const* s0;
   cudf::size_type const* s1;
@@ -597,6 +598,7 @@ struct sum_functor {
 
   cudf::size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; }
 };
+}  // namespace
 
 TEST_F(RowBitCount, Table)
 {
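The next file migrates logger_tests.cpp off spdlog internals onto the public logger interface. Roughly, the surface the updated test exercises looks like this (a sketch; exact sink types depend on the rapids-logger build):

```cpp
// Sketch of the new logging surface used by the test below.
cudf::default_logger().set_level(cudf::level_enum::debug);  // default is info
cudf::default_logger().set_pattern("%v");                   // message-only output
cudf::default_logger().debug("visible at debug level");
```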
diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp
index cfab570833b..58396115a54 100644
--- a/cpp/tests/utilities_tests/logger_tests.cpp
+++ b/cpp/tests/utilities_tests/logger_tests.cpp
@@ -16,29 +16,25 @@
 
 #include
 
-#include
-
-#include
+#include
 
 #include
 
 class LoggerTest : public cudf::test::BaseFixture {
   std::ostringstream oss;
-  spdlog::level::level_enum prev_level;
-  std::vector<spdlog::sink_ptr> prev_sinks;
+  cudf::level_enum prev_level;
 
 public:
-  LoggerTest()
-    : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()}
+  LoggerTest() : prev_level{cudf::default_logger().level()}
   {
-    cudf::detail::logger().sinks() = {std::make_shared<spdlog::sinks::ostream_sink_mt>(oss)};
-    cudf::detail::logger().set_formatter(
-      std::unique_ptr<spdlog::formatter>(new spdlog::pattern_formatter("%v")));
+    cudf::default_logger().sinks().push_back(std::make_shared<cudf::ostream_sink_mt>(oss));
+    cudf::default_logger().set_pattern("%v");
   }
   ~LoggerTest() override
   {
-    cudf::detail::logger().set_level(prev_level);
-    cudf::detail::logger().sinks() = prev_sinks;
+    cudf::default_logger().set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v");
+    cudf::default_logger().set_level(prev_level);
+    cudf::default_logger().sinks().pop_back();
   }
 
   void clear_sink() { oss.str(""); }
@@ -47,32 +43,32 @@ class LoggerTest : public cudf::test::BaseFixture {
 
 TEST_F(LoggerTest, Basic)
 {
-  cudf::detail::logger().critical("crit msg");
+  cudf::default_logger().critical("crit msg");
   ASSERT_EQ(this->sink_content(), "crit msg\n");
 }
 
 TEST_F(LoggerTest, DefaultLevel)
 {
-  cudf::detail::logger().trace("trace");
-  cudf::detail::logger().debug("debug");
-  cudf::detail::logger().info("info");
-  cudf::detail::logger().warn("warn");
-  cudf::detail::logger().error("error");
-  cudf::detail::logger().critical("critical");
-  ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n");
+  cudf::default_logger().trace("trace");
+  cudf::default_logger().debug("debug");
+  cudf::default_logger().info("info");
+  cudf::default_logger().warn("warn");
+  cudf::default_logger().error("error");
+  cudf::default_logger().critical("critical");
+  ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n");
 }
 
 TEST_F(LoggerTest, CustomLevel)
 {
-  cudf::detail::logger().set_level(spdlog::level::warn);
-  cudf::detail::logger().info("info");
-  cudf::detail::logger().warn("warn");
+  cudf::default_logger().set_level(cudf::level_enum::warn);
+  cudf::default_logger().info("info");
+  cudf::default_logger().warn("warn");
   ASSERT_EQ(this->sink_content(), "warn\n");
 
   this->clear_sink();
 
-  cudf::detail::logger().set_level(spdlog::level::debug);
-  cudf::detail::logger().trace("trace");
-  cudf::detail::logger().debug("debug");
+  cudf::default_logger().set_level(cudf::level_enum::debug);
+  cudf::default_logger().trace("trace");
+  cudf::default_logger().debug("debug");
   ASSERT_EQ(this->sink_content(), "debug\n");
 }
diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu
index 4086c5a91bb..8e5129dfbd2 100644
--- a/cpp/tests/wrappers/timestamps_test.cu
+++ b/cpp/tests/wrappers/timestamps_test.cu
@@ -37,6 +37,7 @@
 #include
 #include
 
+namespace {
 template <typename T>
 struct ChronoColumnTest : public cudf::test::BaseFixture {
   cudf::size_type size() { return cudf::size_type(100); }
@@ -72,6 +73,7 @@ struct compare_chrono_elements_to_primitive_representation {
     return primitive == dur.count();
   }
 };
+}  // namespace
 
 TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes);
 
@@ -103,6 +105,7 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation)
     *cudf::column_device_view::create(chrono_col)}));
 }
 
+namespace {
 template <typename T>
 struct compare_chrono_elements {
   cudf::binary_operator comp;
@@ -129,6 +132,7 @@ struct compare_chrono_elements {
     }
   }
 };
+}  // namespace
 
 TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode)
 {
diff --git a/dependencies.yaml b/dependencies.yaml
index 044c7d187b3..44767f1e9d3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -413,7 +413,6 @@ dependencies:
       - fmt>=11.0.2,<12
       - flatbuffers==24.3.25
      - librdkafka>=2.5.0,<2.6.0a0
-      - spdlog>=1.14.1,<1.15
  depends_on_nvcomp:
    common:
      - output_types: conda
@@ -679,10 +678,10 @@ dependencies:
          matrices:
            - matrix: {cuda: "12.*"}
              packages:
-              - cuda-python>=12.0,<13.0a0
+              - cuda-python>=12.6.2,<13.0a0
            - matrix: {cuda: "11.*"}
              packages: &run_pylibcudf_packages_all_cu11
-              - cuda-python>=11.7.1,<12.0a0
+              - cuda-python>=11.8.5,<12.0a0
            - {matrix: null, packages: *run_pylibcudf_packages_all_cu11}
  run_cudf:
    common:
@@ -705,10 +704,10 @@ dependencies:
          matrices:
            - matrix: {cuda: "12.*"}
              packages:
-              - cuda-python>=12.0,<13.0a0
+              - cuda-python>=12.6.2,<13.0a0
            - matrix: {cuda: "11.*"}
              packages: &run_cudf_packages_all_cu11
-              - cuda-python>=11.7.1,<12.0a0
+              - cuda-python>=11.8.5,<12.0a0
            - {matrix: null, packages: *run_cudf_packages_all_cu11}
      - output_types: conda
        matrices:
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index fbb9ca4b128..7aa8f9f4a1c 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -594,6 +594,8 @@ def on_missing_reference(app, env, node, contnode):
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class",
"typing_extensions.Self"), + ("py:class", "np.uint32"), + ("py:class", "np.uint64"), ] diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,4 +36,9 @@ public class RmmAllocationMode {
    * Use CUDA async suballocation strategy
    */
   public static final int CUDA_ASYNC = 0x00000008;
+  /**
+   * Use CUDA async suballocation strategy with fabric handles that are
+   * peer accessible with read-write access
+   */
+  public static final int CUDA_ASYNC_FABRIC = 0x00000010;
 }
diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
index fa1f13cb7ed..cf4936e2e24 100644
--- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
+++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource {
    * @param releaseThreshold size in bytes for when memory is released back to cuda
    */
   public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) {
+    this(size, releaseThreshold, false);
+  }
+
+  /**
+   * Create a new async memory resource
+   * @param size the initial size of the pool
+   * @param releaseThreshold size in bytes for when memory is released back to cuda
+   * @param fabric if true request peer read+write accessible fabric handles when
+   *               creating the pool
+   */
+  public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) {
     this.size = size;
     this.releaseThreshold = releaseThreshold;
-    handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold);
+    handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric);
   }
 
   @Override
diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java
index 286b5c208c9..f3155bc5860 100644
--- a/java/src/main/java/ai/rapids/cudf/Scalar.java
+++ b/java/src/main/java/ai/rapids/cudf/Scalar.java
@@ -521,13 +521,28 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host
   private static native long makeStructScalar(long[] viewHandles, boolean isValid);
   private static native long repeatString(long scalarHandle, int repeatTimes);
 
-  Scalar(DType type, long scalarHandle) {
+  /**
+   * Constructor to create a scalar from a native handle and a type.
+   *
+   * @param type The type of the scalar
+   * @param scalarHandle The native handle (pointer address) to the scalar data
+   */
+  public Scalar(DType type, long scalarHandle) {
     this.type = type;
     this.offHeap = new OffHeapState(scalarHandle);
     MemoryCleaner.register(this, offHeap);
     incRefCount();
   }
 
+  /**
+   * Get the native handle (native pointer address) for the scalar.
+   *
+   * @return The native handle
+   */
+  public long getScalarHandle() {
+    return offHeap.scalarHandle;
+  }
+
   /**
    * Increment the reference count for this scalar. You need to call close on this
    * to decrement the reference count again.
@@ -542,10 +557,6 @@ public synchronized Scalar incRefCount() {
     return this;
   }
 
-  long getScalarHandle() {
-    return offHeap.scalarHandle;
-  }
-
   /**
    * Free the memory associated with a scalar.
   */
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 23c7b7fb243..8c733018fa7 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -772,14 +772,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env,
-                                                                           jclass clazz,
-                                                                           jlong init,
-                                                                           jlong release)
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(
+  JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric)
 {
   try {
     cudf::jni::auto_set_device(env);
-    auto ret = new rmm::mr::cuda_async_memory_resource(init, release);
+
+    auto handle_type =
+      fabric ? std::optional{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}
+             : std::nullopt;
+
+    auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type);
+
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
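What the JNI path above constructs when fabric=true, as a standalone C++ sketch (the function name is illustrative):

```cpp
#include <rmm/mr/device/cuda_async_memory_resource.hpp>

#include <memory>

// Sketch: an async pool whose allocations export fabric handles,
// i.e. peer-accessible (read/write) across devices.
std::unique_ptr<rmm::mr::cuda_async_memory_resource> make_fabric_pool(
  std::size_t initial_size, std::size_t release_threshold)
{
  using handle_t = rmm::mr::cuda_async_memory_resource::allocation_handle_type;
  return std::make_unique<rmm::mr::cuda_async_memory_resource>(
    initial_size, release_threshold, handle_t::fabric);
}
```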
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 4e1bf860872..efe96ff6c3e 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -13,23 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    column.pyx
-    copying.pyx
-    csv.pyx
-    groupby.pyx
-    interop.pyx
-    orc.pyx
-    parquet.pyx
-    reduce.pyx
-    round.pyx
-    scalar.pyx
-    sort.pyx
-    stream_compaction.pyx
-    string_casting.pyx
-    strings_udf.pyx
-    transform.pyx
-    types.pyx
-    utils.pyx
+    column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx
+    sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
 )
 set(linked_libraries cudf::cudf)
@@ -48,4 +33,3 @@ target_link_libraries(interop PUBLIC nanoarrow)
 
 add_subdirectory(io)
 add_subdirectory(nvtext)
-add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index c79d5100622..52e9b89da7b 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -7,14 +7,11 @@
     groupby,
     interop,
     nvtext,
-    orc,
     parquet,
     reduce,
-    round,
     sort,
     stream_compaction,
     string_casting,
-    strings,
     strings_udf,
 )
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 1f3f03f4be1..a7ea9c25a86 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-import pickle - from libcpp cimport bool import pylibcudf @@ -358,14 +356,13 @@ class PackedColumns(Serializable): header["index-names"] = self.index_names header["metadata"] = self._metadata.tobytes() for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() self.column_dtypes[name] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) header["column-dtypes"] = self.column_dtypes - header["type-serialized"] = pickle.dumps(type(self)) return header, frames @classmethod @@ -373,9 +370,9 @@ class PackedColumns(Serializable): column_dtypes = {} for name, dtype in header["column-dtypes"].items(): dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) + column_dtypes[name] = Serializable.device_deserialize( + dtype_header, frames[start:stop] + ) return cls( plc.contiguous_split.pack( plc.contiguous_split.unpack_from_memoryviews( diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 96504ebdd66..9b8bab012e2 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,9 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & data) except* -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef add_df_col_struct_names( df, child_names_dict @@ -26,7 +23,8 @@ cdef update_col_struct_field_names( ) cdef update_struct_field_names( table, - vector[column_name_info]& schema_info) + vector[column_name_info]& schema_info +) cdef Column update_column_struct_field_names( Column col, column_name_info& info diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index f23980b387a..df4675be599 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,97 +1,16 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython.buffer cimport PyBUF_READ -from cpython.memoryview cimport PyMemoryView_FromMemory -from libcpp.memory cimport unique_ptr + from libcpp.string cimport string -from libcpp.utility cimport move + from libcpp.vector cimport vector -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, -) +from pylibcudf.libcudf.io.types cimport column_name_info from cudf._lib.column cimport Column -import codecs -import io -import os - from cudf.core.dtypes import StructDtype -# Converts the Python sink input to libcudf IO sink_info. -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & sink -) except*: - cdef vector[data_sink *] data_sinks - cdef vector[string] paths - if isinstance(src[0], io.StringIO): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.TextIOBase): - data_sinks.reserve(len(src)) - for s in src: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. 
- if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink.push_back( - unique_ptr[data_sink](new iobase_data_sink(s.buffer)) - ) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.IOBase): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], (basestring, os.PathLike)): - paths.reserve(len(src)) - for s in src: - paths.push_back( os.path.expanduser(s).encode()) - return sink_info(move(paths)) - else: - raise TypeError("Unrecognized input type: {}".format(type(src))) - - -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*: - cdef vector[unique_ptr[data_sink]] datasinks - cdef sink_info info = make_sinks_info([src], datasinks) - if not datasinks.empty(): - sink.swap(datasinks[0]) - return info - - -# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you -# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) -cdef cppclass iobase_data_sink(data_sink): - object buf - - iobase_data_sink(object buf_): - this.buf = buf_ - - void host_write(const void * data, size_t size) with gil: - if isinstance(buf, io.StringIO): - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) - .tobytes().decode()) - else: - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) - - void flush() with gil: - buf.flush() - - size_t bytes_written() with gil: - return buf.tell() - - cdef add_df_col_struct_names(df, child_names_dict): for name, child_names in child_names_dict.items(): col = df._data[name] diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 25cfcf99ca6..9f2b3f92502 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,19 +10,9 @@ from pylibcudf import nvtext @acquire_spill_lock() -def minhash(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): +def minhash(Column input, uint32_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash_permuted( + nvtext.minhash.minhash( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -33,19 +23,9 @@ def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width) @acquire_spill_lock() -def minhash64(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): +def minhash64(Column input, uint64_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash64_permuted( + nvtext.minhash.minhash64( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -53,21 +33,3 @@ def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int widt width, ) ) - - -@acquire_spill_lock() -def word_minhash(Column input, Column seeds): - result = nvtext.minhash.word_minhash( - 
input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.word_minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx deleted file mode 100644 index c829cac6409..00000000000 --- a/python/cudf/cudf/_lib/orc.pyx +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int64_t -from libcpp cimport bool, int -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.vector cimport vector -import itertools -from collections import OrderedDict - -try: - import ujson as json -except ImportError: - import json - -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.core.buffer import acquire_spill_lock -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.orc cimport OrcChunkedWriter - -# TODO: Consider inlining this function since it seems to only be used in one place. -cpdef read_parsed_orc_statistics(filepath_or_buffer): - """ - Cython function to call into libcudf API, see `read_parsed_orc_statistics`. - - See Also - -------- - cudf.io.orc.read_orc_statistics - """ - - parsed = ( - plc.io.orc.read_parsed_orc_statistics( - plc.io.SourceInfo([filepath_or_buffer]) - ) - ) - - return parsed.column_names, parsed.file_stats, parsed.stripes_stats - - -cpdef read_orc(object filepaths_or_buffers, - object columns=None, - object stripes=None, - object skip_rows=None, - object num_rows=None, - bool use_index=True, - object timestamp_type=None): - """ - Cython function to call into libcudf API, see `read_orc`. - - See Also - -------- - cudf.read_orc - - Notes - ----- - Currently this function only considers the metadata of the first file in the list of - filepaths_or_buffers. - """ - - if columns is not None: - columns = [str(col) for col in columns] - - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - get_skiprows_arg(skip_rows), - get_num_rows_arg(num_rows), - use_index, - plc.types.DataType( - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - - names = tbl_w_meta.column_names(include_children=False) - - actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, - names, - skip_rows, - num_rows) - - if columns is not None and (isinstance(columns, list) and len(columns) == 0): - # When `columns=[]`, index needs to be - # established, but not the columns. 
- nrows = tbl_w_meta.tbl.num_rows() - return {}, cudf.RangeIndex(nrows) - - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names - ) - - if is_range_index: - index = range_idx - elif reset_index_name: - index.names = [None] * len(index.names) - - child_name_values = tbl_w_meta.child_names.values() - - data = { - name: update_col_struct_field_names( - col, child_names - ) - for (name, col), child_names in zip(data.items(), child_name_values) - } - - return data, index - - -def _get_comp_type(object compression): - if compression is None or compression is False: - return plc.io.types.CompressionType.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return plc.io.types.CompressionType.SNAPPY - elif compression == "ZLIB": - return plc.io.types.CompressionType.ZLIB - elif compression == "ZSTD": - return plc.io.types.CompressionType.ZSTD - elif compression == "LZ4": - return plc.io.types.CompressionType.LZ4 - else: - raise ValueError(f"Unsupported `compression` type {compression}") - - -cdef tuple _get_index_from_metadata( - vector[map[string, string]] user_data, - object names, - object skip_rows, - object num_rows): - - meta = None - index_col = None - is_range_index = False - reset_index_name = False - range_idx = None - - if user_data.size() > 0: - json_str = user_data[0][b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'] - if isinstance(index_col[0], dict) and \ - index_col[0]['kind'] == 'range': - is_range_index = True - else: - index_col_names = OrderedDict() - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = \ - c['name'] or c['field_name'] - if c['name'] is None: - reset_index_name = True - - actual_index_names = None - if index_col is not None and len(index_col) > 0: - if is_range_index: - range_index_meta = index_col[0] - range_idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - if skip_rows is not None: - range_idx = range_idx[skip_rows:] - if num_rows is not None: - range_idx = range_idx[:num_rows] - else: - actual_index_names = list(index_col_names.values()) - names = names[len(actual_index_names):] - - return ( - actual_index_names, - names, - is_range_index, - reset_index_name, - range_idx - ) - - -def _get_orc_stat_freq(str statistics): - """ - Convert ORC statistics terms to CUDF convention: - - ORC "STRIPE" == CUDF "ROWGROUP" - - ORC "ROWGROUP" == CUDF "PAGE" - """ - statistics = str(statistics).upper() - if statistics == "NONE": - return plc.io.types.StatisticsFreq.STATISTICS_NONE - elif statistics == "STRIPE": - return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP - elif statistics == "ROWGROUP": - return plc.io.types.StatisticsFreq.STATISTICS_PAGE - else: - raise ValueError(f"Unsupported `statistics_freq` type {statistics}") - - -@acquire_spill_lock() -def write_orc( - table, - object path_or_buf, - object compression="snappy", - str statistics="ROWGROUP", - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None, - object cols_as_map_type=None, - object index=None -): - """ - Cython function to call into libcudf API, see `cudf::io::write_orc`. 
- - See Also - -------- - cudf.read_orc - """ - user_data = {} - user_data["pandas"] = generate_pandas_metadata(table, index) - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = table._columns if table._index is None else [ - *table.index._columns, *table._columns - ] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - if cols_as_map_type is not None: - cols_as_map_type = set(cols_as_map_type) - - for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - (cols_as_map_type is not None) - and (name in cols_as_map_type), - ) - - options = ( - plc.io.orc.OrcWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(compression)) - .enable_statistics(_get_orc_stat_freq(statistics)) - .build() - ) - if stripe_size_bytes is not None: - options.set_stripe_size_bytes(stripe_size_bytes) - if stripe_size_rows is not None: - options.set_stripe_size_rows(stripe_size_rows) - if row_index_stride is not None: - options.set_row_index_stride(row_index_stride) - - plc.io.orc.write_orc(options) - - -cdef int64_t get_skiprows_arg(object arg) except*: - arg = 0 if arg is None else arg - if not isinstance(arg, int) or arg < 0: - raise TypeError("skiprows must be an int >= 0") - return arg - -cdef int64_t get_num_rows_arg(object arg) except*: - arg = -1 if arg is None else arg - if not isinstance(arg, int) or arg < -1: - raise TypeError("num_rows must be an int >= -1") - return arg - - -cdef class ORCWriter: - """ - ORCWriter lets you you incrementally write out a ORC file from a series - of cudf tables - - See Also - -------- - cudf.io.orc.to_orc - """ - cdef bool initialized - cdef OrcChunkedWriter writer - cdef SinkInfo sink - cdef str statistics - cdef object compression - cdef object index - cdef TableInputMetadata tbl_meta - cdef object cols_as_map_type - cdef object stripe_size_bytes - cdef object stripe_size_rows - cdef object row_index_stride - - def __cinit__(self, - object path, - object index=None, - object compression="snappy", - str statistics="ROWGROUP", - object cols_as_map_type=None, - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None): - self.sink = plc.io.SinkInfo([path]) - self.statistics = statistics - self.compression = compression - self.index = index - self.cols_as_map_type = cols_as_map_type \ - if cols_as_map_type is None else set(cols_as_map_type) - self.stripe_size_bytes = stripe_size_bytes - self.stripe_size_rows = stripe_size_rows - self.row_index_stride = row_index_stride - self.initialized = False - - def write_table(self, table): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state(table) - - keep_index = self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex) - ) - if keep_index: - columns = [ 
- col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - else: - columns = [col.to_pylibcudf(mode="read") for col in table._columns] - - self.writer.write(plc.Table(columns)) - - def close(self): - if not self.initialized: - return - - self.writer.close() - - def __dealloc__(self): - self.close() - - def _initialize_chunked_state(self, table): - """ - Prepare all the values required to build the - chunked_orc_writer_options anb creates a writer""" - - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - idx_name - ) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name( - table._index.name - ) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - (self.cols_as_map_type is not None) - and (name in self.cols_as_map_type), - ) - - user_data = {} - pandas_metadata = generate_pandas_metadata(table, self.index) - user_data["pandas"] = pandas_metadata - - options = ( - plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) - if self.stripe_size_bytes is not None: - options.set_stripe_size_bytes(self.stripe_size_bytes) - if self.stripe_size_rows is not None: - options.set_stripe_size_rows(self.stripe_size_rows) - if self.row_index_stride is not None: - options.set_row_index_stride(self.row_index_stride) - - self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) - - self.initialized = True - -cdef _set_col_children_metadata(Column col, - ColumnInMetadata col_meta, - list_column_as_map=False): - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_children_metadata( - child_col, col_meta.child(i), list_column_as_map - ) - elif isinstance(col.dtype, cudf.ListDtype): - if list_column_as_map: - col_meta.set_list_column_as_map() - _set_col_children_metadata( - col.children[cpp_lists_column_view.child_column_index], - col_meta.child(cpp_lists_column_view.child_column_index), - list_column_as_map - ) - else: - return diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c77c9875342..00c434ae374 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,11 +20,8 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport int64_t, 
uint8_t
+from libc.stdint cimport int64_t
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 from pylibcudf.expressions cimport Expression
 from pylibcudf.io.parquet cimport ChunkedParquetReader
@@ -47,47 +44,6 @@ from pylibcudf cimport Table
 from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata
 from pylibcudf.io.parquet cimport ParquetChunkedWriter
-from cython.operator cimport dereference
-
-
-cdef class BufferArrayFromVector:
-    cdef Py_ssize_t length
-    cdef unique_ptr[vector[uint8_t]] in_vec
-
-    # these two things declare part of the buffer interface
-    cdef Py_ssize_t shape[1]
-    cdef Py_ssize_t strides[1]
-
-    @staticmethod
-    cdef BufferArrayFromVector from_unique_ptr(
-        unique_ptr[vector[uint8_t]] in_vec
-    ):
-        cdef BufferArrayFromVector buf = BufferArrayFromVector()
-        buf.in_vec = move(in_vec)
-        buf.length = dereference(buf.in_vec).size()
-        return buf
-
-    def __getbuffer__(self, Py_buffer *buffer, int flags):
-        cdef Py_ssize_t itemsize = sizeof(uint8_t)
-
-        self.shape[0] = self.length
-        self.strides[0] = 1
-
-        buffer.buf = dereference(self.in_vec).data()
-
-        buffer.format = NULL  # byte
-        buffer.internal = NULL
-        buffer.itemsize = itemsize
-        buffer.len = self.length * itemsize  # product(shape) * itemsize
-        buffer.ndim = 1
-        buffer.obj = self
-        buffer.readonly = 0
-        buffer.shape = self.shape
-        buffer.strides = self.strides
-        buffer.suboffsets = NULL
-
-    def __releasebuffer__(self, Py_buffer *buffer):
-        pass
 
 
 def _parse_metadata(meta):
@@ -205,7 +161,7 @@ cdef object _process_metadata(object df,
         else:
             start = range_index_meta["start"] + skip_rows
             stop = range_index_meta["stop"]
-            if nrows != -1:
+            if nrows > -1:
                 stop = start + nrows
             idx = cudf.RangeIndex(
                 start=start,
@@ -256,16 +212,27 @@ def read_parquet_chunked(
     # (see read_parquet)
     allow_range_index = columns is not None and len(columns) != 0
 
+    options = (
+        plc.io.parquet.ParquetReaderOptions.builder(
+            plc.io.SourceInfo(filepaths_or_buffers)
+        )
+        .use_pandas_metadata(use_pandas_metadata)
+        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
+        .build()
+    )
+    if row_groups is not None:
+        options.set_row_groups(row_groups)
+    if nrows > -1:
+        options.set_num_rows(nrows)
+    if skip_rows != 0:
+        options.set_skip_rows(skip_rows)
+    if columns is not None:
+        options.set_columns(columns)
+
     reader = ChunkedParquetReader(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        columns,
-        row_groups,
-        use_pandas_metadata=use_pandas_metadata,
+        options,
         chunk_read_limit=chunk_read_limit,
         pass_read_limit=pass_read_limit,
-        skip_rows=skip_rows,
-        nrows=nrows,
-        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
     )
 
     tbl_w_meta = reader.read_chunk()
@@ -325,19 +292,26 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     if columns is not None and len(columns) == 0 or filters:
         allow_range_index = False
 
-    # Read Parquet
-
-    tbl_w_meta = plc.io.parquet.read_parquet(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        columns,
-        row_groups,
-        filters,
-        convert_strings_to_categories = False,
-        use_pandas_metadata = use_pandas_metadata,
-        skip_rows = skip_rows,
-        nrows = nrows,
-        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
+    options = (
+        plc.io.parquet.ParquetReaderOptions.builder(
+            plc.io.SourceInfo(filepaths_or_buffers)
+        )
+        .use_pandas_metadata(use_pandas_metadata)
+        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
+        .build()
     )
+    if row_groups is not None:
+        options.set_row_groups(row_groups)
+    if nrows > -1:
+        options.set_num_rows(nrows)
+    if skip_rows != 0:
+        options.set_skip_rows(skip_rows)
+    if columns is not None:
+        options.set_columns(columns)
+    if filters is not None:
+        options.set_filter(filters)
+
+    tbl_w_meta = plc.io.parquet.read_parquet(options)
 
     df = cudf.DataFrame._from_data(
         *data_from_pylibcudf_io(tbl_w_meta)
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
deleted file mode 100644
index f961c09e6f6..00000000000
--- a/python/cudf/cudf/_lib/round.pyx
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-from pylibcudf.round import RoundingMethod
-
-
-@acquire_spill_lock()
-def round(Column input_col, int decimal_places=0, how="half_even"):
-    """
-    Round column values to the given number of decimal places
-
-    Parameters
-    ----------
-    input_col : Column whose values will be rounded
-    decimal_places : The number or decimal places to round to
-
-    Returns
-    -------
-    A Column with values rounded to the given number of decimal places
-    """
-    if how not in {"half_even", "half_up"}:
-        raise ValueError("'how' must be either 'half_even' or 'half_up'")
-
-    how = (
-        RoundingMethod.HALF_EVEN if how == "half_even"
-        else RoundingMethod.HALF_UP
-    )
-
-    return Column.from_pylibcudf(
-        plc.round.round(
-            input_col.to_pylibcudf(mode="read"),
-            decimal_places,
-            how
-        )
-    )
diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt
deleted file mode 100644
index dca9c4cc3fc..00000000000
--- a/python/cudf/cudf/_lib/strings/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-add_subdirectory(convert)
-add_subdirectory(split)
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index b795c54c112..b9095a22a42 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -9,10 +9,6 @@
 from cudf._lib.nvtext.minhash import (
     minhash,
     minhash64,
-    minhash64_permuted,
-    minhash_permuted,
-    word_minhash,
-    word_minhash64,
 )
 from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
 from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
@@ -32,18 +28,3 @@
     detokenize,
     tokenize_with_vocabulary,
 )
-from cudf._lib.strings.convert.convert_fixed_point import to_decimal
-from cudf._lib.strings.convert.convert_floats import is_float
-from cudf._lib.strings.convert.convert_integers import is_integer
-from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
-from cudf._lib.strings.split.partition import partition, rpartition
-from cudf._lib.strings.split.split import (
-    rsplit,
-    rsplit_re,
-    rsplit_record,
-    rsplit_record_re,
-    split,
-    split_re,
-    split_record,
-    split_record_re,
-)
diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
deleted file mode 100644
index e8a76b476a8..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx
-    convert_lists.pyx convert_urls.pyx
-)
-
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
deleted file mode 100644
index 96dcd021c3b..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def from_decimal(Column input_col):
-    """
-    Converts a `Decimal64Column` to a `StringColumn`.
-
-    Parameters
-    ----------
-    input_col : input column of type decimal
-
-    Returns
-    -------
-    A column of strings representing the input decimal values.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def to_decimal(Column input_col, object out_type):
-    """
-    Returns a `Decimal64Column` from the provided `StringColumn`
-    using the scale in the `out_type`.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-    out_type : The type and scale of the decimal column expected
-
-    Returns
-    -------
-    A column of decimals parsed from the string values.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-        dtype_to_pylibcudf_type(out_type),
-    )
-    result = Column.from_pylibcudf(plc_column)
-    result.dtype.precision = out_type.precision
-    return result
-
-
-@acquire_spill_lock()
-def is_fixed_point(Column input_col, object dtype):
-    """
-    Returns a Column of boolean values with True for `input_col`
-    that have fixed-point characters. The output row also has a
-    False value if the corresponding string would cause an integer
-    overflow. The scale of the `dtype` is used to determine overflow
-    in the output row.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-    dtype : The type and scale of a decimal column
-
-    Returns
-    -------
-    A Column of booleans indicating valid decimal conversion.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-        dtype_to_pylibcudf_type(dtype),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
deleted file mode 100644
index 5da6e3f10cc..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def is_float(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have floats.
-    """
-    plc_column = plc.strings.convert.convert_floats.is_float(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
deleted file mode 100644
index 50113347ccb..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def is_integer(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have integers.
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_integers.is_integer(
-            source_strings.to_pylibcudf(mode="read")
-        )
-    )
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
deleted file mode 100644
index 3a2cb4bd5c7..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from cudf._lib.scalar import as_device_scalar
-
-
-@acquire_spill_lock()
-def format_list_column(Column source_list, Column separators):
-    """
-    Format a list column of strings into a strings column.
-
-    Parameters
-    ----------
-    input_col : input column of type list with strings child.
-
-    separators: strings used for formatting (', ', '[', ']')
-
-    Returns
-    -------
-    Formatted strings column
-    """
-    plc_column = plc.strings.convert.convert_lists.format_list_column(
-        source_list.to_pylibcudf(mode="read"),
-        as_device_scalar("None").c_value,
-        separators.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
deleted file mode 100644
index d5c2f771970..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def url_decode(Column source_strings):
-    """
-    Decode each string in column. No format checking is performed.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    URL decoded string column
-    """
-    plc_column = plc.strings.convert.convert_urls.url_decode(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def url_encode(Column source_strings):
-    """
-    Encode each string in column. No format checking is performed.
-    All characters are encoded except for ASCII letters, digits,
-    and these characters: '.','_','-','~'. Encoding converts to
-    hex using UTF-8 encoded bytes.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    URL encoded string column
-    """
-    plc_column = plc.strings.convert.convert_urls.url_encode(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
deleted file mode 100644
index 4ede0a2fac5..00000000000
--- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources partition.pyx split.pyx)
-
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
deleted file mode 100644
index 5319addc41c..00000000000
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def partition(Column source_strings,
-              object py_delimiter):
-    """
-    Returns data by splitting the `source_strings`
-    column at the first occurrence of the specified `py_delimiter`.
-    """
-    plc_table = plc.strings.split.partition.partition(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rpartition(Column source_strings,
-               object py_delimiter):
-    """
-    Returns a Column by splitting the `source_strings`
-    column at the last occurrence of the specified `py_delimiter`.
-    """
-    plc_table = plc.strings.split.partition.rpartition(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
deleted file mode 100644
index 4ec6c7073d8..00000000000
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def split(Column source_strings,
-          object py_delimiter,
-          size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from beginning.
-    """
-    plc_table = plc.strings.split.split.split(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def split_record(Column source_strings,
-                 object py_delimiter,
-                 size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from beginning.
-    """
-    plc_column = plc.strings.split.split.split_record(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def rsplit(Column source_strings,
-           object py_delimiter,
-           size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from the end.
-    """
-    plc_table = plc.strings.split.split.rsplit(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rsplit_record(Column source_strings,
-                  object py_delimiter,
-                  size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from the end.
-    """
-    plc_column = plc.strings.split.split.rsplit_record(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def split_re(Column source_strings,
-             object pattern,
-             size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    """
-    plc_table = plc.strings.split.split.split_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rsplit_re(Column source_strings,
-              object pattern,
-              size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    The delimiters are searched starting from the end of each string.
-    """
-    plc_table = plc.strings.split.split.rsplit_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def split_record_re(Column source_strings,
-                    object pattern,
-                    size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    """
-    plc_column = plc.strings.split.split.split_record_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def rsplit_record_re(Column source_strings,
-                     object pattern,
-                     size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    The delimiters are searched starting from the end of each string.
-    """
-    plc_column = plc.strings.split.split.rsplit_record_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
index dd2fafbe07f..83f0cb850a5 100644
--- a/python/cudf/cudf/_lib/strings_udf.pyx
+++ b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -1,7 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t, uint16_t, uintptr_t
-
 from pylibcudf.libcudf.strings_udf cimport (
     get_character_cases_table as cpp_get_character_cases_table,
     get_character_flags_table as cpp_get_character_flags_table,
@@ -27,6 +26,7 @@
 from rmm.librmm.device_buffer cimport device_buffer
 from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 from cudf._lib.column cimport Column
+from pylibcudf cimport Column as plc_Column
 
 
 def get_cuda_build_version():
@@ -52,9 +52,9 @@ def column_from_udf_string_array(DeviceBuffer d_buffer):
     c_result = move(cpp_column_from_udf_string_array(data, size))
     cpp_free_udf_string_array(data, size)
 
-    result = Column.from_unique_ptr(move(c_result))
-
-    return result
+    return Column.from_pylibcudf(
+        plc_Column.from_libcudf(move(c_result))
+    )
 
 
 def get_character_flags_table_ptr():
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
deleted file mode 100644
index a163bb07888..00000000000
--- a/python/cudf/cudf/_lib/transform.pyx
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from numba.np import numpy_support
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock, as_buffer
-from cudf.utils import cudautils
-
-from pylibcudf cimport transform as plc_transform
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def bools_to_mask(Column col):
-    """
-    Given an int8 (boolean) column, compress the data from booleans to bits and
-    return a Buffer
-    """
-    mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read"))
-    return as_buffer(mask)
-
-
-@acquire_spill_lock()
-def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
-    """
-    Given a mask buffer, returns a boolean column representng bit 0 -> False
-    and 1 -> True within range of [begin_bit, end_bit),
-    """
-    if not isinstance(mask_buffer, cudf.core.buffer.Buffer):
-        raise TypeError("mask_buffer is not an instance of "
-                        "cudf.core.buffer.Buffer")
-    plc_column = plc_transform.mask_to_bools(
-        mask_buffer.get_ptr(mode="read"), begin_bit, end_bit
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def nans_to_nulls(Column input):
-    mask, _ = plc_transform.nans_to_nulls(
-        input.to_pylibcudf(mode="read")
-    )
-    return as_buffer(mask)
-
-
-@acquire_spill_lock()
-def transform(Column input, op):
-    nb_type = numpy_support.from_dtype(input.dtype)
-    nb_signature = (nb_type,)
-    compiled_op = cudautils.compile_udf(op, nb_signature)
-    np_dtype = cudf.dtype(compiled_op[1])
-
-    plc_column = plc_transform.transform(
-        input.to_pylibcudf(mode="read"),
-        compiled_op[0],
-        plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
-        True
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def table_encode(list source_columns):
-    plc_table, plc_column = plc_transform.encode(
-        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns])
-    )
-
-    return (
-        [Column.from_pylibcudf(col) for col in plc_table.columns()],
-        Column.from_pylibcudf(plc_column)
-    )
-
-
-def one_hot_encode(Column input_column, Column categories):
-    plc_table = plc_transform.one_hot_encode(
-        input_column.to_pylibcudf(mode="read"),
-        categories.to_pylibcudf(mode="read"),
-    )
-    result_columns = [
-        Column.from_pylibcudf(col, data_ptr_exposed=True)
-        for col in plc_table.columns()
-    ]
-    result_labels = [
-        x if x is not None else ''
-        for x in categories.to_arrow().to_pylist()
-    ]
-    return dict(zip(result_labels, result_columns))
-
-
-@acquire_spill_lock()
-def compute_column(list columns, tuple column_names, str expr):
-    """Compute a new column by evaluating an expression on a set of columns.
-
-    Parameters
-    ----------
-    columns : list
-        The set of columns forming the table to evaluate the expression on.
-    column_names : tuple[str]
-        The names associated with each column. These names are necessary to map
-        column names in the expression to indices in the provided list of
-        columns, which are what will be used by libcudf to evaluate the
-        expression on the table.
-    expr : str
-        The expression to evaluate.
-    """
-    result = plc_transform.compute_column(
-        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
-        plc.expressions.to_expression(expr, column_names),
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 6b3f10e1806..ff032656f80 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -16,7 +16,7 @@
 from pylibcudf.libcudf.table.table_view cimport table_view
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
-
+from pylibcudf cimport Column as plc_Column
 try:
     import ujson as json
 except ImportError:
@@ -223,10 +223,11 @@ cdef columns_from_unique_ptr(
 
     cdef size_t i
 
-    columns = [Column.from_unique_ptr(move(dereference(it+i)))
-               for i in range(c_columns.size())]
-
-    return columns
+    return [
+        Column.from_pylibcudf(
+            plc_Column.from_libcudf(move(dereference(it+i)))
+        ) for i in range(c_columns.size())
+    ]
 
 
 cpdef columns_from_pylibcudf_table(tbl):
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 2df154ee112..1b6152b81ca 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import pickle
 import warnings
 from functools import cached_property
 from typing import TYPE_CHECKING, Any, Literal
@@ -330,13 +329,6 @@ def get_level_values(self, level):
         else:
             raise KeyError(f"Requested level with name {level} " "not found")
 
-    @classmethod
-    def deserialize(cls, header, frames):
-        # Dispatch deserialization to the appropriate index type in case
-        # deserialization is ever attempted with the base class directly.
-        idx_type = pickle.loads(header["type-serialized"])
-        return idx_type.deserialize(header, frames)
-
     @property
     def names(self):
         """
diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py
index ce6bb83bc77..c8ea03b04fe 100644
--- a/python/cudf/cudf/core/abc.py
+++ b/python/cudf/cudf/core/abc.py
@@ -1,8 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 """Common abstract base classes for cudf."""
 
-import pickle
-
 import numpy
 
 import cudf
@@ -22,6 +20,14 @@ class Serializable:
     latter converts back from that representation into an equivalent object.
     """
 
+    # A mapping from class names to the classes themselves. This is used to
+    # reconstruct the correct class when deserializing an object.
+    _name_type_map: dict = {}
+
+    def __init_subclass__(cls, /, **kwargs):
+        super().__init_subclass__(**kwargs)
+        cls._name_type_map[cls.__name__] = cls
+
     def serialize(self):
         """Generate an equivalent serializable representation of an object.
 
@@ -98,7 +104,7 @@ def device_serialize(self):
             )
             for f in frames
         )
-        header["type-serialized"] = pickle.dumps(type(self))
+        header["type-serialized-name"] = type(self).__name__
         header["is-cuda"] = [
             hasattr(f, "__cuda_array_interface__") for f in frames
         ]
@@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames):
 
         :meta private:
         """
-        typ = pickle.loads(header["type-serialized"])
+        typ = cls._name_type_map[header["type-serialized-name"]]
         frames = [
             cudf.core.buffer.as_buffer(f) if c else memoryview(f)
-            for c, f in zip(header["is-cuda"], frames)
+            for c, f in zip(header["is-cuda"], frames, strict=True)
         ]
         return typ.deserialize(header, frames)
 
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index ffa306bf93f..625938ca168 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import math
-import pickle
 import weakref
 from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, Literal
@@ -432,8 +431,7 @@ def serialize(self) -> tuple[dict, list]:
             second element is a list containing single frame.
         """
         header: dict[str, Any] = {}
-        header["type-serialized"] = pickle.dumps(type(self))
-        header["owner-type-serialized"] = pickle.dumps(type(self._owner))
+        header["owner-type-serialized-name"] = type(self._owner).__name__
        header["frame_count"] = 1
        frames = [self]
        return header, frames
@@ -460,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self:
         if isinstance(frame, cls):
             return frame  # The frame is already deserialized
 
-        owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"])
+        owner_type: BufferOwner = Serializable._name_type_map[
+            header["owner-type-serialized-name"]
+        ]
         if hasattr(frame, "__cuda_array_interface__"):
             owner = owner_type.from_device_memory(frame, exposed=False)
         else:
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index 7305ff651c6..cbb65229933 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import collections.abc
-import pickle
 import time
 import weakref
 from threading import RLock
@@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]:
         header: dict[str, Any] = {}
         frames: list[Buffer | memoryview]
         with self._owner.lock:
-            header["type-serialized"] = pickle.dumps(self.__class__)
-            header["owner-type-serialized"] = pickle.dumps(type(self._owner))
+            header["owner-type-serialized-name"] = type(self._owner).__name__
             header["frame_count"] = 1
             if self.is_spilled:
                 frames = [self.memoryview()]
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 6a4f7919fb5..a0cf38c6f51 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -13,7 +13,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.transform import bools_to_mask
 from cudf.core._internals import unary
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
@@ -775,12 +774,11 @@ def to_pandas(
             raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
         if self.categories.dtype.kind == "f":
-            new_mask = bools_to_mask(self.notnull())
             col = type(self)(
                 data=self.data,  # type: ignore[arg-type]
                 size=self.size,
                 dtype=self.dtype,
-                mask=new_mask,
+                mask=self.notnull().fillna(False).as_mask(),
                 children=self.children,
             )
         else:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 62b4ec5e8b8..0cf5884daaf 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import pickle
 from collections import abc
 from collections.abc import MutableSequence, Sequence
 from functools import cached_property
@@ -32,7 +31,6 @@
     drop_duplicates,
     drop_nulls,
 )
-from cudf._lib.transform import bools_to_mask
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -373,10 +371,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
 
         return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
 
+    @acquire_spill_lock()
    def _get_mask_as_column(self) -> ColumnBase:
-        return libcudf.transform.mask_to_bools(
-            self.base_mask, self.offset, self.offset + len(self)
+        plc_column = plc.transform.mask_to_bools(
+            self.base_mask.get_ptr(mode="read"),  # type: ignore[union-attr]
+            self.offset,
+            self.offset + len(self),
         )
+        return type(self).from_pylibcudf(plc_column)
 
     @cached_property
     def memory_usage(self) -> int:
@@ -981,11 +983,14 @@ def as_mask(self) -> Buffer:
         -------
         Buffer
         """
-
         if self.has_nulls():
             raise ValueError("Column must have no nulls.")
 
-        return bools_to_mask(self)
+        with acquire_spill_lock():
+            mask, _ = plc.transform.bools_to_mask(
+                self.to_pylibcudf(mode="read")
+            )
+            return as_buffer(mask)
 
     @property
     def is_unique(self) -> bool:
@@ -1288,28 +1293,27 @@ def serialize(self) -> tuple[dict, list]:
         header: dict[Any, Any] = {}
         frames = []
 
-        header["type-serialized"] = pickle.dumps(type(self))
         try:
-            dtype, dtype_frames = self.dtype.serialize()
+            dtype, dtype_frames = self.dtype.device_serialize()
             header["dtype"] = dtype
             frames.extend(dtype_frames)
             header["dtype-is-cudf-serialized"] = True
         except AttributeError:
-            header["dtype"] = pickle.dumps(self.dtype)
+            header["dtype"] = self.dtype.str
             header["dtype-is-cudf-serialized"] = False
 
         if self.data is not None:
-            data_header, data_frames = self.data.serialize()
+            data_header, data_frames = self.data.device_serialize()
             header["data"] = data_header
             frames.extend(data_frames)
 
         if self.mask is not None:
-            mask_header, mask_frames = self.mask.serialize()
+            mask_header, mask_frames = self.mask.device_serialize()
             header["mask"] = mask_header
             frames.extend(mask_frames)
         if self.children:
             child_headers, child_frames = zip(
-                *(c.serialize() for c in self.children)
+                *(c.device_serialize() for c in self.children)
             )
             header["subheaders"] = list(child_headers)
             frames.extend(chain(*child_frames))
@@ -1321,8 +1325,7 @@ def serialize(self) -> tuple[dict, list]:
     def deserialize(cls, header: dict, frames: list) -> ColumnBase:
         def unpack(header, frames) -> tuple[Any, list]:
             count = header["frame_count"]
-            klass = pickle.loads(header["type-serialized"])
-            obj = klass.deserialize(header, frames[:count])
+            obj = cls.device_deserialize(header, frames[:count])
             return obj, frames[count:]
 
         assert header["frame_count"] == len(frames), (
@@ -1332,7 +1335,7 @@ def unpack(header, frames) -> tuple[Any, list]:
         if header["dtype-is-cudf-serialized"]:
             dtype, frames = unpack(header["dtype"], frames)
         else:
-            dtype = pickle.loads(header["dtype"])
+            dtype = np.dtype(header["dtype"])
         if "data" in header:
             data, frames = unpack(header["data"], frames)
         else:
@@ -1514,6 +1517,18 @@ def _return_sentinel_column():
             )
         return codes.fillna(na_sentinel.value)
 
+    def one_hot_encode(
+        self, categories: ColumnBase
+    ) -> abc.Generator[ColumnBase]:
+        plc_table = plc.transform.one_hot_encode(
+            self.to_pylibcudf(mode="read"),
+            categories.to_pylibcudf(mode="read"),
+        )
+        return (
+            type(self).from_pylibcudf(col, data_ptr_exposed=True)
+            for col in plc_table.columns()
+        )
+
 
 def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
     """Check if an object dtype Series or array contains NaN."""
@@ -2089,8 +2104,7 @@ def as_column(
             )
             # Consider NaT as NA in the mask
             # but maintain NaT as a value
-            bool_mask = as_column(~is_nat)
-            mask = as_buffer(bools_to_mask(bool_mask))
+            mask = as_column(~is_nat).as_mask()
             buffer = as_buffer(arbitrary.view("|u1"))
             col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
             if dtype:
@@ -2260,8 +2274,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
         )
         return as_buffer(data=desc["data"][0], size=mask_size, owner=obj)
     elif typecode == "b":
-        col = as_column(cai_mask)
-        return bools_to_mask(col)
+        return as_column(cai_mask).as_mask()
     else:
         raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
@@ -2287,7 +2300,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]:
     frames = []
 
     if len(columns) > 0:
-        header_columns = [c.serialize() for c in columns]
+        header_columns: list[tuple[dict, list]] = [
+            c.device_serialize() for c in columns
+        ]
         headers, column_frames = zip(*header_columns)
         for f in column_frames:
             frames.extend(f)
@@ -2304,7 +2319,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]:
 
     for meta in headers:
         col_frame_count = meta["frame_count"]
-        col_typ = pickle.loads(meta["type-serialized"])
+        col_typ = Serializable._name_type_map[meta["type-serialized-name"]]
        colobj = col_typ.deserialize(meta, frames[:col_frame_count])
        columns.append(colobj)
        # Advance frames
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 2c22724d3d7..9e6a73f1a9c 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -10,13 +10,12 @@
 import numpy as np
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib.strings.convert.convert_fixed_point import (
-    from_decimal as cpp_from_decimal,
-)
 from cudf.api.types import is_scalar
 from cudf.core._internals import binaryop, unary
-from cudf.core.buffer import as_buffer
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase
 from cudf.core.column.numerical_base import NumericalBaseColumn
 from cudf.core.dtypes import (
@@ -89,7 +88,13 @@ def as_decimal_column(
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
-            return cpp_from_decimal(self)
+            with acquire_spill_lock():
+                plc_column = (
+                    plc.strings.convert.convert_fixed_point.from_fixed_point(
+                        self.to_pylibcudf(mode="read"),
+                    )
+                )
+                return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
         else:
             return cast(
                 cudf.core.column.StringColumn,
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index ea384888388..ba98e28f6a2 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -14,7 +14,6 @@
 
 import cudf
 import cudf.core.column.column as column
-from cudf._lib.strings.convert.convert_lists import format_list_column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.buffer import acquire_spill_lock
@@ -256,7 +255,7 @@ def from_sequences(
             data=None,
             size=len(arbitrary),
             dtype=cudf.ListDtype(data_col.dtype),
-            mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
+            mask=as_column(mask_col).as_mask(),
             offset=0,
             null_count=0,
             children=(offset_col, data_col),
@@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
         # Separator strings to match the Python format
         separators = as_column([", ", "[", "]"])
 
-        # Call libcudf to format the list column
-        return format_list_column(lc, separators)
+        with acquire_spill_lock():
+            plc_column = plc.strings.convert.convert_lists.format_list_column(
+                lc.to_pylibcudf(mode="read"),
+                cudf.Scalar("None").device_value.c_value,
+                separators.to_pylibcudf(mode="read"),
+            )
+            return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
     def _transform_leaves(self, func, *args, **kwargs) -> Self:
         # return a new list column with the same nested structure
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 9514aaeab50..790cd6ea9bb 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -7,9 +7,10 @@
 
 import numpy as np
 import pandas as pd
+from numba.np import numpy_support
 from typing_extensions import Self
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
 import cudf.core.column.column as column
@@ -17,11 +18,13 @@
 from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_scalar
 from cudf.core._internals import binaryop, unary
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase, as_column
 from cudf.core.column.numerical_base import NumericalBaseColumn
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
 from cudf.errors import MixedTypeError
+from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     find_common_type,
     min_column_type,
@@ -179,13 +182,27 @@ def __setitem__(self, key: Any, value: Any):
         if out:
             self._mimic_inplace(out, inplace=True)
 
+    @acquire_spill_lock()
+    def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase:
+        plc_column = plc.transform.transform(
+            self.to_pylibcudf(mode="read"),
+            compiled_op[0],
+            plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
+            True,
+        )
+        return type(self).from_pylibcudf(plc_column)
+
     def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
         if callable(unaryop):
-            return libcudf.transform.transform(self, unaryop)
+            nb_type = numpy_support.from_dtype(self.dtype)
+            nb_signature = (nb_type,)
+            compiled_op = cudautils.compile_udf(unaryop, nb_signature)
+            np_dtype = np.dtype(compiled_op[1])
+            return self.transform(compiled_op, np_dtype)
 
         unaryop = unaryop.upper()
         unaryop = _unaryop_map.get(unaryop, unaryop)
-        unaryop = pylibcudf.unary.UnaryOperator[unaryop]
+        unaryop = plc.unary.UnaryOperator[unaryop]
         return unary.unary_operation(self, unaryop)
 
     def __invert__(self):
@@ -298,8 +315,11 @@ def nans_to_nulls(self: Self) -> Self:
        # Only floats can contain nan.
        if self.dtype.kind != "f" or self.nan_count == 0:
            return self
-        newmask = libcudf.transform.nans_to_nulls(self)
-        return self.set_mask(newmask)
+        with acquire_spill_lock():
+            mask, _ = plc.transform.nans_to_nulls(
+                self.to_pylibcudf(mode="read")
+            )
+            return self.set_mask(as_buffer(mask))
 
     def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar:
         if isinstance(other, ColumnBase):
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index 689c390fbdc..cf45734b746 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
 
@@ -246,12 +246,21 @@ def corr(self, other: NumericalBaseColumn) -> float:
         return cov / lhs_std / rhs_std
 
     def round(
-        self, decimals: int = 0, how: str = "half_even"
+        self,
+        decimals: int = 0,
+        how: Literal["half_even", "half_up"] = "half_even",
     ) -> NumericalBaseColumn:
         if not cudf.api.types.is_integer(decimals):
-            raise TypeError("Values in decimals must be integers")
-        """Round the values in the Column to the given number of decimals."""
-        return libcudf.round.round(self, decimal_places=decimals, how=how)
+            raise TypeError("Argument 'decimals' must be an integer")
+        if how not in {"half_even", "half_up"}:
+            raise ValueError(f"{how=} must be either 'half_even' or 'half_up'")
+        plc_how = plc.round.RoundingMethod[how.upper()]
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                plc.round.round(
+                    self.to_pylibcudf(mode="read"), decimals, plc_how
+                )
+            )
 
     def _scan(self, op: str) -> ColumnBase:
         return libcudf.reduce.scan(
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index c742b8e2c6e..36429acf7f6 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -19,6 +19,7 @@
 import cudf.api.types
 import cudf.core.column.column as column
 import cudf.core.column.datetime as datetime
+from cudf import _lib as libcudf
 from cudf._lib import string_casting as str_cast, strings as libstrings
 from cudf._lib.column import Column
 from cudf._lib.types import size_type_dtype
@@ -44,6 +45,7 @@
         SeriesOrIndex,
     )
     from cudf.core.buffer import Buffer
+    from cudf.core.column.numerical import NumericalColumn
 
 
 def str_to_boolean(column: StringColumn):
@@ -1336,7 +1338,7 @@ def isinteger(self) -> SeriesOrIndex:
         2    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_integer(self._column))
+        return self._return_or_inplace(self._column.is_integer())
 
     def ishex(self) -> SeriesOrIndex:
         """
@@ -1468,7 +1470,7 @@ def isfloat(self) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_float(self._column))
+        return self._return_or_inplace(self._column.is_float())
 
     def isdecimal(self) -> SeriesOrIndex:
         """
@@ -2710,26 +2712,25 @@ def split(
         if len(str(pat)) <= 1:
             regex = False
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.split_re(self._column, pat, n)
+                    data = self._column.split_re(pat, n)
                 else:
-                    data = libstrings.split(
-                        self._column, cudf.Scalar(pat, "str"), n
-                    )
+                    data = self._column.split(cudf.Scalar(pat, "str"), n)
                if len(data) == 1 and data[0].null_count == len(self._column):
                    result_table = {}
                else:
                    result_table = data
         else:
             if regex is True:
-                result_table = libstrings.split_record_re(self._column, pat, n)
+                result_table = self._column.split_record_re(pat, n)
             else:
-                result_table = libstrings.split_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.split_record(
+                    cudf.Scalar(pat, "str"), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2883,28 +2884,25 @@ def rsplit(
         if regex and isinstance(pat, re.Pattern):
             pat = pat.pattern
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.rsplit_re(self._column, pat, n)
+                    data = self._column.rsplit_re(pat, n)
                 else:
-                    data = libstrings.rsplit(
-                        self._column, cudf.Scalar(pat, "str"), n
-                    )
+                    data = self._column.rsplit(cudf.Scalar(pat, "str"), n)
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = {}
                 else:
                     result_table = data
         else:
             if regex is True:
-                result_table = libstrings.rsplit_record_re(
-                    self._column, pat, n
-                )
+                result_table = self._column.rsplit_record_re(pat, n)
             else:
-                result_table = libstrings.rsplit_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.rsplit_record(
+                    cudf.Scalar(pat, "str"), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2989,7 +2987,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.partition(self._column, cudf.Scalar(sep, "str")),
+            self._column.partition(cudf.Scalar(sep, "str")),
             expand=expand,
         )
@@ -3054,7 +3052,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
+            self._column.rpartition(cudf.Scalar(sep, "str")),
             expand=expand,
         )
@@ -4499,8 +4497,7 @@ def url_decode(self) -> SeriesOrIndex:
         1            https://medium.com/rapids-ai
         dtype: object
         """
-
-        return self._return_or_inplace(libstrings.url_decode(self._column))
+        return self._return_or_inplace(self._column.url_decode())
 
     def url_encode(self) -> SeriesOrIndex:
         """
@@ -4531,7 +4528,7 @@ def url_encode(self) -> SeriesOrIndex:
         1    https%3A%2F%2Fmedium.com%2Frapids-ai
         dtype: object
         """
-        return self._return_or_inplace(libstrings.url_encode(self._column))
+        return self._return_or_inplace(self._column.url_encode())
 
     def code_points(self) -> SeriesOrIndex:
         """
@@ -5464,49 +5461,6 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
         )
 
     def minhash(
-        self, seeds: ColumnLike | None = None, width: int = 4
-    ) -> SeriesOrIndex:
-        """
-        Compute the minhash of a strings column.
-        This uses the MurmurHash3_x86_32 algorithm for the hash function.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint32.
-        width : int
-            The width of the substring to hash.
-            Default is 4 characters.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> str_series = cudf.Series(['this is my', 'favorite book'])
-        >>> seeds = cudf.Series([0], dtype=np.uint32)
-        >>> str_series.str.minhash(seeds)
-        0     [21141582]
-        1    [962346254]
-        dtype: list
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        >>> str_series.str.minhash(seeds)
-        0    [21141582, 403093213, 1258052021]
-        1    [962346254, 677440381, 122618762]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint32:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.minhash(self._column, seeds_column, width)
-        )
-
-    def minhash_permuted(
         self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
         """
@@ -5538,7 +5492,7 @@ def minhash_permuted(
        >>> s = cudf.Series(['this is my', 'favorite book'])
        >>> a = cudf.Series([1, 2, 3], dtype=np.uint32)
        >>> b = cudf.Series([4, 5, 6], dtype=np.uint32)
-        >>> s.str.minhash_permuted(0, a=a, b=b, width=5)
+        >>> s.str.minhash(0, a=a, b=b, width=5)
        0    [1305480171, 462824409, 74608232]
        1      [32665388, 65330773, 97996158]
        dtype: list
@@ -5554,53 +5508,10 @@ def minhash_permuted(
                 f"Expecting a Series with dtype uint32, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash_permuted(
-                self._column, seed, a_column, b_column, width
-            )
+            libstrings.minhash(self._column, seed, a_column, b_column, width)
         )
 
     def minhash64(
-        self, seeds: ColumnLike | None = None, width: int = 4
-    ) -> SeriesOrIndex:
-        """
-        Compute the minhash of a strings column.
-
-        This uses the MurmurHash3_x64_128 algorithm for the hash function.
-        This function generates 2 uint64 values but only the first
-        uint64 value is used.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint64.
-        width : int
-            The width of the substring to hash.
-            Default is 4 characters.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> str_series = cudf.Series(['this is my', 'favorite book'])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
-        >>> str_series.str.minhash64(seeds)
-        0    [3232308021562742685, 4445611509348165860, 586435843695903598]
-        1      [23008204270530356, 1281229757012344693, 153762819128779913]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint64:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.minhash64(self._column, seeds_column, width)
-        )
-
-    def minhash64_permuted(
         self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
         """
@@ -5631,7 +5542,7 @@ def minhash64_permuted(
        >>> s = cudf.Series(['this is my', 'favorite book', 'to read'])
        >>> a = cudf.Series([2, 3], dtype=np.uint64)
        >>> b = cudf.Series([5, 6], dtype=np.uint64)
-        >>> s.str.minhash64_permuted(0, a=a, b=b, width=5)
+        >>> s.str.minhash64(0, a=a, b=b, width=5)
        0      [172452388517576012, 316595762085180527]
        1        [71427536958126239, 58787297728258215]
        2    [423885828176437114, 1140588505926961370]
        dtype: list
@@ -5648,79 +5559,7 @@ def minhash64_permuted(
                 f"Expecting a Series with dtype uint64, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash64_permuted(
-                self._column, seed, a_column, b_column, width
-            )
-        )
-
-    def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
-        """
-        Compute the minhash of a list column of strings.
-        This uses the MurmurHash3_x86_32 algorithm for the hash function.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint32.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> import numpy as np
-        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        >>> ls.str.word_minhash(seeds=seeds)
-        0     [21141582, 1232889953, 1268336794]
-        1    [962346254, 2321233602, 1354839212]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint32:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.word_minhash(self._column, seeds_column)
-        )
-
-    def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
-        """
-        Compute the minhash of a list column of strings.
-        This uses the MurmurHash3_x64_128 algorithm for the hash function.
-        This function generates 2 uint64 values but only the first
-        uint64 value is used.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint64.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> import numpy as np
-        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
-        >>> ls.str.word_minhash64(seeds)
-        0    [2603139454418834912, 8644371945174847701, 5541030711534384340]
-        1      [5240044617220523711, 5847101123925041457, 153762819128779913]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint64:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.word_minhash64(self._column, seeds_column)
+            libstrings.minhash64(self._column, seed, a_column, b_column, width)
         )
 
     def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
@@ -6015,13 +5854,13 @@ def as_numerical_column(
         out_dtype = cudf.api.types.dtype(dtype)
         string_col = self
         if out_dtype.kind in {"i", "u"}:
-            if not libstrings.is_integer(string_col).all():
+            if not string_col.is_integer().all():
                 raise ValueError(
                     "Could not convert strings to integer "
                     "type due to presence of non-integer values."
                 )
         elif out_dtype.kind == "f":
-            if not libstrings.is_float(string_col).all():
+            if not string_col.is_float().all():
                 raise ValueError(
                     "Could not convert strings to float "
                     "type due to presence of non-floating values."
@@ -6099,10 +5938,17 @@ def as_timedelta_column(
     ) -> cudf.core.column.TimeDeltaColumn:
         return self.strptime(dtype, "%D days %H:%M:%S")  # type: ignore[return-value]
 
+    @acquire_spill_lock()
     def as_decimal_column(
         self, dtype: Dtype
-    ) -> "cudf.core.column.DecimalBaseColumn":
-        return libstrings.to_decimal(self, dtype)
+    ) -> cudf.core.column.DecimalBaseColumn:
+        plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
+            self.to_pylibcudf(mode="read"),
+            libcudf.types.dtype_to_pylibcudf_type(dtype),
+        )
+        result = Column.from_pylibcudf(plc_column)
+        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
+        return result  # type: ignore[return-value]
 
     def as_string_column(self) -> StringColumn:
         return self
@@ -6138,12 +5984,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
         if self.dtype == to_dtype:
             return True
-        elif (
-            to_dtype.kind in {"i", "u"}
-            and not libstrings.is_integer(self).all()
-        ):
+        elif to_dtype.kind in {"i", "u"} and not self.is_integer().all():
             return False
-        elif to_dtype.kind == "f" and not libstrings.is_float(self).all():
+        elif to_dtype.kind == "f" and not self.is_float().all():
             return False
         else:
             return True
@@ -6333,11 +6176,180 @@ def title(self) -> Self:
     def is_title(self) -> Self:
         return self._modify_characters(plc.strings.capitalize.is_title)
 
+    @acquire_spill_lock()
     def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
-        with acquire_spill_lock():
-            plc_result = plc.strings.replace.replace_multiple(
-                self.to_pylibcudf(mode="read"),
-                pattern.to_pylibcudf(mode="read"),
-                replacements.to_pylibcudf(mode="read"),
+        plc_result = plc.strings.replace.replace_multiple(
+            self.to_pylibcudf(mode="read"),
+            pattern.to_pylibcudf(mode="read"),
+            replacements.to_pylibcudf(mode="read"),
+        )
+        return cast(Self, Column.from_pylibcudf(plc_result))
+
+    @acquire_spill_lock()
+    def _split_record_re(
+        self,
+        pattern: str,
+        maxsplit: int,
+        method: Callable[
+            [plc.Column, plc.strings.regex_program.RegexProgram, int],
+            plc.Column,
+        ],
+    ) -> Self:
+        plc_column = method(
+            self.to_pylibcudf(mode="read"),
plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return cast(Self, Column.from_pylibcudf(plc_column)) + + def split_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.split_record_re + ) + + def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.rsplit_record_re + ) + + @acquire_spill_lock() + def _split_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Table, + ], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() ) - return cast(Self, Column.from_pylibcudf(plc_result)) + ) + + def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.split_re + ) + + def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.rsplit_re + ) + + @acquire_spill_lock() + def _split_record( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.split_record + ) + + def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.rsplit_record + ) + + @acquire_spill_lock() + def _split( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.split) + + def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) + + @acquire_spill_lock() + def _partition( + self, + delimiter: cudf.Scalar, + method: Callable[[plc.Column, plc.Scalar], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.partition + ) + + def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.rpartition + ) + + @acquire_spill_lock() + def url_decode(self) -> Self: + plc_column = 
plc.strings.convert.convert_urls.url_decode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def url_encode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_encode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_integer(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_integers.is_integer( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_float(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_floats.is_float( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25bab9bd371..eef03571100 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,7 +7,6 @@ import itertools import numbers import os -import pickle import re import sys import textwrap @@ -50,7 +49,6 @@ ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column import ( CategoricalColumn, @@ -588,7 +586,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer): pass -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): +class DataFrame(IndexedFrame, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -1183,7 +1181,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. @@ -1198,8 +1196,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj @@ -6761,9 +6758,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) result = column.as_column(result, dtype=result_dtype) if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) + result = result.set_mask(mask._column.as_mask()) return Series._from_column(result, index=self.index) else: result_df = DataFrame(result, index=self.index) @@ -7870,6 +7865,16 @@ def interleave_columns(self): ) return self._constructor_sliced._from_column(result_col) + @acquire_spill_lock() + def _compute_columns(self, expr: str) -> ColumnBase: + plc_column = plc.transform.compute_column( + plc.Table( + [col.to_pylibcudf(mode="read") for col in self._columns] + ), + plc.expressions.to_expression(expr, self._column_names), + ) + return libcudf.column.Column.from_pylibcudf(plc_column) + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. 
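
The `_compute_columns` helper added above routes an expression string through `plc.transform.compute_column`: the frame's columns are packed into a `plc.Table`, the expression is parsed against the current column names, and a single result column comes back under the spill lock. A minimal usage sketch of the `eval` path that now relies on it (frame contents and names here are illustrative, assuming standard cudf semantics):

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    # No assignment: a single _compute_columns call returning a Series.
    ser = df.eval("a + b")
    # Multi-statement form: the hunk below evaluates one expression per
    # assignment target.
    out = df.eval("c = a + b\nd = a - b")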
@@ -7997,11 +8002,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
             raise ValueError(
                 "Cannot operate inplace if there is no assignment"
             )
-            return Series._from_column(
-                libcudf.transform.compute_column(
-                    [*self._columns], self._column_names, statements[0]
-                )
-            )
+            return Series._from_column(self._compute_columns(statements[0]))
 
         targets = []
         exprs = []
@@ -8017,15 +8018,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
             targets.append(t.strip())
             exprs.append(e.strip())
 
-        cols = (
-            libcudf.transform.compute_column(
-                [*self._columns], self._column_names, e
-            )
-            for e in exprs
-        )
         ret = self if inplace else self.copy(deep=False)
-        for name, col in zip(targets, cols):
-            ret._data[name] = col
+        for name, expr in zip(targets, exprs):
+            ret._data[name] = self._compute_columns(expr)
 
         if not inplace:
             return ret
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index aa601a2b322..a798041699e 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -799,8 +799,7 @@ def _set_missing_values(
         valid_mask = _ensure_gpu_buffer(
             valid_mask[0], valid_mask[1], allow_copy
         )
-        boolmask = as_column(valid_mask._buf, dtype="bool")
-        bitmask = cudf._lib.transform.bools_to_mask(boolmask)
+        bitmask = as_column(valid_mask._buf, dtype="bool").as_mask()
         return cudf_col.set_mask(bitmask)
     elif null == _MaskKind.BITMASK:
         valid_mask = _ensure_gpu_buffer(
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index a44ad91e499..971f0be77f8 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -3,7 +3,6 @@
 
 import decimal
 import operator
-import pickle
 import textwrap
 import warnings
 from functools import cached_property
@@ -91,13 +90,13 @@ def dtype(arbitrary):
     raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype")
 
 
-def _decode_type(
+def _check_type(
     cls: type,
     header: dict,
     frames: list,
     is_valid_class: Callable[[type, type], bool] = operator.is_,
-) -> tuple[dict, list, type]:
-    """Decode metadata-encoded type and check validity
+) -> None:
+    """Check the validity of the metadata-encoded type
 
     Parameters
     ----------
@@ -112,12 +111,6 @@ class performing deserialization
         serialization by `cls` (default is to check type equality), called
         as `is_valid_class(decoded_class, cls)`.
 
-    Returns
-    -------
-    tuple
-        Tuple of validated headers, frames, and the decoded class
-        constructor.
-
     Raises
     ------
     AssertionError
@@ -128,11 +121,11 @@ class performing deserialization
         f"Deserialization expected {header['frame_count']} frames, "
        f"but received {len(frames)}."
     )
-    klass = pickle.loads(header["type-serialized"])
+    klass = Serializable._name_type_map[header["type-serialized-name"]]
     assert is_valid_class(
-        klass, cls
+        klass,
+        cls,
    ), f"Header-encoded {klass=} does not match decoding {cls=}."
- return header, frames, klass class _BaseDtype(ExtensionDtype, Serializable): @@ -303,13 +296,14 @@ def construct_from_string(self): def serialize(self): header = {} - header["type-serialized"] = pickle.dumps(type(self)) header["ordered"] = self.ordered frames = [] if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() + categories_header, categories_frames = ( + self.categories.device_serialize() + ) header["categories"] = categories_header frames.extend(categories_frames) header["frame_count"] = len(frames) @@ -317,15 +311,14 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( + categories = Serializable.device_deserialize( categories_header, categories_frames ) - return klass(categories=categories, ordered=ordered) + return cls(categories=categories, ordered=ordered) def __repr__(self): return self.to_pandas().__repr__() @@ -493,12 +486,13 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames = [] if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() + header["element-type"], frames = ( + self.element_type.device_serialize() + ) else: header["element-type"] = getattr( self.element_type, "name", self.element_type @@ -508,14 +502,14 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) + element_type = Serializable.device_deserialize( + header["element-type"], frames + ) else: element_type = header["element-type"] - return klass(element_type=element_type) + return cls(element_type=element_type) @cached_property def itemsize(self): @@ -639,7 +633,6 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames: list[Buffer] = [] @@ -647,33 +640,31 @@ def serialize(self) -> tuple[dict, list]: for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() fields[k] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) else: - fields[k] = pickle.dumps(dtype) + fields[k] = dtype.str header["fields"] = fields header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = Serializable.device_deserialize( dtype_header, frames[start:stop], ) else: - fields[k] = pickle.loads(dtype) + fields[k] = np.dtype(dtype) return cls(fields) @cached_property @@ -836,7 
+827,6 @@ def _from_decimal(cls, decimal): def serialize(self) -> tuple[dict, list]: return ( { - "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, "frame_count": 0, @@ -846,11 +836,8 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) + _check_type(cls, header, frames, is_valid_class=issubclass) + return cls(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -958,18 +945,17 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), + "fields": (self.subtype.str, self.closed), "frame_count": 0, } return header, [] @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) + _check_type(cls, header, frames) + subtype, closed = header["fields"] + subtype = np.dtype(subtype) + return cls(subtype, closed=closed) def _is_categorical_dtype(obj): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..00199cca828 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections import abc from typing import TYPE_CHECKING, Any, Literal @@ -24,6 +23,7 @@ from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals.search import search_sorted +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -47,7 +47,7 @@ # TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): +class Frame(BinaryOperand, Scannable, Serializable): """A collection of Column objects with an optional index. 
Parameters @@ -97,37 +97,80 @@ def ndim(self) -> int: @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright + frames = [] header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), + "column_label_dtype": None, + "dtype-is-cudf-serialized": False, } - header["columns"], frames = serialize_columns(self._columns) + if (label_dtype := self._data.label_dtype) is not None: + try: + header["column_label_dtype"], frames = ( + label_dtype.device_serialize() + ) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["column_label_dtype"] = label_dtype.str + + header["columns"], column_frames = serialize_columns(self._columns) + column_names, column_names_numpy_type = ( + zip( + *[ + (cname.item(), type(cname).__name__) + if isinstance(cname, np.generic) + else (cname, "") + for cname in self._column_names + ] + ) + if self._column_names + else ((), ()) + ) + header |= { + "column_names": column_names, + "column_names_numpy_type": column_names_numpy_type, + "column_rangeindex": self._data.rangeindex, + "column_multiindex": self._data.multiindex, + "column_level_names": self._data._level_names, + } + frames.extend(column_frames) + return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) kwargs = {} + dtype_header = header["column_label_dtype"] + if header["dtype-is-cudf-serialized"]: + count = dtype_header["frame_count"] + kwargs["label_dtype"] = cls.device_deserialize( + header, frames[:count] + ) + frames = frames[count:] + else: + kwargs["label_dtype"] = ( + np.dtype(dtype_header) if dtype_header is not None else None + ) + + columns = deserialize_columns(header["columns"], frames) for metadata in [ "rangeindex", "multiindex", - "label_dtype", "level_names", ]: key = f"column_{metadata}" if key in header: - kwargs[metadata] = pickle.loads(header[key]) + kwargs[metadata] = header[key] + + column_names = [ + getattr(np, cntype)(cname) if cntype != "" else cname + for cname, cntype in zip( + header["column_names"], header["column_names_numpy_type"] + ) + ] col_accessor = ColumnAccessor( data=dict(zip(column_names, columns)), **kwargs ) - return cls_deserialize._from_data(col_accessor) + return cls._from_data(col_accessor) @classmethod @_performance_tracking @@ -1457,7 +1500,14 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) + plc_table, plc_column = plc.transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) + ) + columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + indices = libcudf.column.Column.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cdb1e0702a3..a8d82f977d5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3,7 +3,6 @@ import copy import itertools -import 
pickle import textwrap import warnings from collections import abc @@ -1280,7 +1279,7 @@ def serialize(self): obj_header, obj_frames = self.obj.serialize() header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) + header["obj_type_name"] = type(self.obj).__name__ header["num_obj_frames"] = len(obj_frames) frames.extend(obj_frames) @@ -1295,7 +1294,7 @@ def serialize(self): def deserialize(cls, header, frames): kwargs = header["kwargs"] - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) @@ -3328,8 +3327,8 @@ def _handle_misc(self, by): def serialize(self): header = {} frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) + header["names"] = self.names + header["_named_columns"] = self._named_columns column_header, column_frames = cudf.core.column.serialize_columns( self._key_columns ) @@ -3339,8 +3338,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index fee844f0bd5..8d3ef1036d1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections.abc import Hashable, MutableMapping from functools import cache, cached_property @@ -497,9 +496,8 @@ def serialize(self): header["index_column"]["step"] = self.step frames = [] - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) + header["name"] = self.name + header["dtype"] = self.dtype.str header["frame_count"] = 0 return header, frames @@ -507,11 +505,14 @@ def serialize(self): @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] - name = pickle.loads(header["name"]) + name = header["name"] start = h["start"] stop = h["stop"] step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) + dtype = np.dtype(header["dtype"]) + return RangeIndex( + start=start, stop=stop, step=step, dtype=dtype, name=name + ) @property # type: ignore @_performance_tracking diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3b9a92be9c9..81d954960e2 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3507,7 +3507,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + col.set_base_mask(ans_mask.as_mask()) result = cudf.Series._from_column(col, index=self.index) return result @@ -3969,7 +3969,13 @@ def round(self, decimals=0, how="half_even"): cols = ( col.round(decimals[name], how=how) - if name in decimals and col.dtype.kind in "fiu" + if name in decimals + and ( + col.dtype.kind in "fiu" + or isinstance( + col.dtype, (cudf.Decimal32Dtype, cudf.Decimal64Dtype) + ) + ) else col.copy(deep=True) for name, col in self._column_labels_and_values ) diff --git a/python/cudf/cudf/core/multiindex.py 
b/python/cudf/cudf/core/multiindex.py index 173d4e1c584..5a41a33e583 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,6 @@ import itertools import numbers import operator -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any @@ -921,15 +920,15 @@ def take(self, indices) -> Self: def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) + header["column_names"] = self.names return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. - column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) + column_names = header["column_names"] + header["column_names"] = range(0, len(column_names)) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index d95d252559f..391ee31f125 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,7 +15,6 @@ # limitations under the License. from __future__ import annotations -import pickle import warnings from typing import TYPE_CHECKING @@ -26,6 +25,7 @@ import cudf from cudf._lib.column import Column +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, @@ -97,21 +97,21 @@ def serialize(self): header, frames = super().serialize() grouping_head, grouping_frames = self.grouping.serialize() header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) + header["resampler_type"] = type(self).__name__ header["grouping_frames_count"] = len(grouping_frames) frames.extend(grouping_frames) return header, frames @classmethod def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) - resampler_cls = pickle.loads(header["resampler_type"]) + resampler_cls = Serializable._name_type_map[header["resampler_type"]] out = resampler_cls.__new__(resampler_cls) out.grouping = grouping super().__init__(out, obj, by=grouping) @@ -163,8 +163,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames[: -header["__bin_labels_count"]] ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 84c653c5b3f..59a3e9dbf3b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar @@ -1338,7 +1337,11 @@ def _one_hot_encode_column( f"np.iinfo({size_type_dtype}).max. 
Consider reducing " "size of category" ) - data = one_hot_encode(column, categories) + result_labels = ( + x if x is not None else "" + for x in categories.to_arrow().to_pylist() + ) + data = dict(zip(result_labels, column.one_hot_encode(categories))) if drop_first and len(data): data.pop(next(iter(data))) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58cefc6554e..647e20fc16b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,7 +4,6 @@ import functools import inspect -import pickle import textwrap import warnings from collections import abc @@ -17,7 +16,6 @@ from typing_extensions import Self, assert_never import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -28,7 +26,6 @@ ) from cudf.core import indexing_utils from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -415,7 +412,7 @@ def _loc_to_iloc(self, arg): return indices -class Series(SingleColumnFrame, IndexedFrame, Serializable): +class Series(SingleColumnFrame, IndexedFrame): """ One-dimensional GPU array (including time series). @@ -526,7 +523,7 @@ def from_categorical(cls, categorical, codes=None): mask = None if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) + mask = valid_codes.as_mask() col = CategoricalColumn( data=col.data, size=codes.size, @@ -900,7 +897,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
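
As with `DataFrame.serialize` earlier in this patch, the index header is now produced by `device_serialize`, which records the concrete type by name (resolved later through `Serializable._name_type_map`) rather than embedding a pickled class object. A hedged round-trip sketch of that protocol (header keys follow the `Serializable` changes in this patch; treat the exact names as assumptions):

    import cudf
    from cudf.core.abc import Serializable

    s = cudf.Series([1, 2, 3], name="x")
    header, frames = s.device_serialize()  # header: metadata incl. type name; frames: buffers
    restored = Serializable.device_deserialize(header, frames)
    assert restored.equals(s)  # assumes a round-trip-safe dtype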
@@ -916,8 +913,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80ee078917a..8be336021b1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -15,9 +15,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -232,7 +229,7 @@ def to_datetime( ) break elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): + if not arg_col.is_integer().all(): col = new_series._column.strptime( cudf.dtype("datetime64[ns]"), format=format ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 91f23490031..40348461f8c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -2,14 +2,13 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -18,10 +17,16 @@ from cudf.utils.dtypes import can_convert_to_column if TYPE_CHECKING: - from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): +def to_numeric( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + downcast: Literal["integer", "signed", "unsigned", "float", None] = None, + dtype_backend=None, +): """ Convert argument into numerical types. 
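
The new annotations pin down the accepted modes: `errors` must be "raise", "coerce", or "ignore", and `downcast` one of "integer", "signed", "unsigned", "float", or None. Typical calls (values illustrative):

    import cudf

    s = cudf.Series(["1", "2.5", "apple"])
    cudf.to_numeric(s, errors="coerce")  # unparseable entries become nulls
    cudf.to_numeric(cudf.Series(["1", "2"]), downcast="unsigned")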
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): else: try: col = _convert_str_col( - col._get_decategorized_column(), errors, downcast + col._get_decategorized_column(), # type: ignore[attr-defined] + errors, + downcast, ) except ValueError as e: if errors == "ignore": @@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): raise e elif is_string_dtype(dtype): try: - col = _convert_str_col(col, errors, downcast) + col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: if errors == "ignore": return arg @@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): return col.values -def _convert_str_col(col, errors, _downcast=None): +def _convert_str_col( + col: StringColumn, + errors: Literal["raise", "coerce", "ignore"], + _downcast: Literal["integer", "signed", "unsigned", "float", None] = None, +) -> NumericalColumn: """ Converts a string column to numeric column @@ -212,13 +223,21 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) + if col.is_integer().all(): + return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value] - col = _proc_inf_empty_strings(col) + # TODO: This can be handled by libcudf in + # future see StringColumn.as_numerical_column + converted_col = ( + col.to_lower() + .find_and_replace(as_column([""]), as_column(["NaN"])) + .replace_multiple( + as_column(["+", "inf", "inity"]), # type: ignore[arg-type] + as_column(["", "Inf", ""]), # type: ignore[arg-type] + ) + ) - is_float = libstrings.is_float(col) + is_float = converted_col.is_float() if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -227,27 +246,14 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." 
) ) - return col.astype(dtype=cudf.dtype("float32")) + return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value] else: - return col.astype(dtype=cudf.dtype("float64")) + return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - col = libcudf.string_casting.stod(col) + converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col + converted_col[non_numerics] = None + return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = col.to_lower() # type: ignore[attr-defined] - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = col.replace_multiple( # type: ignore[attr-defined] - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e8d634598f4..a91a4951306 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -4,7 +4,6 @@ import pandas as pd import cudf -from cudf._lib.transform import bools_to_mask __all__ = ["randomdata", "timeseries"] @@ -70,7 +69,7 @@ def timeseries( size=len(index), p=[1 - nulls_frequency, nulls_frequency], ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + mask_buf = cudf.core.column.as_column(mask).as_mask() masked_col = gdf[col]._column.set_mask(mask_buf) gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 68b60809bb9..5616413b7e4 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,147 +1,28 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+from __future__ import annotations -import datetime +import itertools import warnings +from typing import TYPE_CHECKING, Literal import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib import orc as liborc +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json -def _make_empty_df(filepath_or_buffer, columns): - from pyarrow import orc - - orc_file = orc.ORCFile(filepath_or_buffer) - schema = orc_file.schema - col_names = schema.names if columns is None else columns - return cudf.DataFrame._from_data( - data={ - col_name: cudf.core.column.column_empty( - row_count=0, - dtype=schema.field(col_name).type.to_pandas_dtype(), - ) - for col_name in col_names - } - ) - - -def _parse_column_statistics(cs, column_statistics_blob): - # Initialize stats to return and parse stats blob - column_statistics = {} - cs.ParseFromString(column_statistics_blob) - - # Load from parsed stats blob into stats to return - if cs.HasField("numberOfValues"): - column_statistics["number_of_values"] = cs.numberOfValues - if cs.HasField("hasNull"): - column_statistics["has_null"] = cs.hasNull - - if cs.HasField("intStatistics"): - column_statistics["minimum"] = ( - cs.intStatistics.minimum - if cs.intStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.intStatistics.maximum - if cs.intStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None - ) - - elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = ( - cs.doubleStatistics.minimum - if cs.doubleStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.doubleStatistics.maximum - if cs.doubleStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.doubleStatistics.sum - if cs.doubleStatistics.HasField("sum") - else None - ) - - elif cs.HasField("stringStatistics"): - column_statistics["minimum"] = ( - cs.stringStatistics.minimum - if cs.stringStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.stringStatistics.maximum - if cs.stringStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.stringStatistics.sum - - elif cs.HasField("bucketStatistics"): - column_statistics["true_count"] = cs.bucketStatistics.count[0] - column_statistics["false_count"] = ( - column_statistics["number_of_values"] - - column_statistics["true_count"] - ) - - elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = ( - cs.decimalStatistics.minimum - if cs.decimalStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.decimalStatistics.maximum - if cs.decimalStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.decimalStatistics.sum - - elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("maximum") - 
else None - ) - - elif cs.HasField("timestampStatistics"): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. - # TODO: Support minimum and maximum by reading writer's local timezone - if cs.timestampStatistics.HasField( - "minimumUtc" - ) and cs.timestampStatistics.HasField("maximumUtc"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.minimumUtc / 1000, datetime.timezone.utc - ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc - ) - - elif cs.HasField("binaryStatistics"): - column_statistics["sum"] = cs.binaryStatistics.sum - - return column_statistics +if TYPE_CHECKING: + from cudf.core.column import ColumnBase @ioutils.doc_read_orc_metadata() @@ -175,11 +56,12 @@ def read_orc_statistics( path_or_buf = ioutils._select_single_source( path_or_buf, "read_orc_statistics" ) - ( - column_names, - parsed_file_statistics, - parsed_stripes_statistics, - ) = liborc.read_parsed_orc_statistics(path_or_buf) + parsed = plc.io.orc.read_parsed_orc_statistics( + plc.io.SourceInfo([path_or_buf]) + ) + column_names = parsed.column_names + parsed_file_statistics = parsed.file_stats + parsed_stripes_statistics = parsed.stripes_stats # Parse file statistics file_statistics = { @@ -273,16 +155,14 @@ def read_orc( columns=None, filters=None, stripes=None, - skiprows=None, - num_rows=None, - use_index=True, + skiprows: int | None = None, + num_rows: int | None = None, + use_index: bool = True, timestamp_type=None, storage_options=None, bytes_per_thread=None, ): """{docstring}""" - from cudf import DataFrame - if skiprows is not None: # Do not remove until cuIO team approves its removal. warnings.warn( @@ -329,31 +209,132 @@ def read_orc( # Return empty if everything was filtered if len(selected_stripes) == 0: - return _make_empty_df(filepaths_or_buffers[0], columns) + from pyarrow import orc + + orc_file = orc.ORCFile(filepaths_or_buffers[0]) + schema = orc_file.schema + col_names = schema.names if columns is None else columns + return cudf.DataFrame._from_data( + data={ + col_name: cudf.core.column.column_empty( + row_count=0, + dtype=schema.field(col_name).type.to_pandas_dtype(), + ) + for col_name in col_names + } + ) else: stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, - ) + if columns is not None: + columns = [str(col) for col in columns] + + if skiprows is None: + skiprows = 0 + elif not isinstance(skiprows, int) or skiprows < 0: + raise TypeError("skiprows must be an int >= 0") + + if num_rows is None: + num_rows = -1 + elif not isinstance(num_rows, int) or num_rows < -1: + raise TypeError("num_rows must be an int >= -1") + + tbl_w_meta = plc.io.orc.read_orc( + plc.io.SourceInfo(filepaths_or_buffers), + columns, + stripes, + skiprows, + num_rows, + use_index, + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)), ) + + if isinstance(columns, list) and len(columns) == 0: + # When `columns=[]`, index needs to be + # established, but not the columns. 
+ nrows = tbl_w_meta.tbl.num_rows() + data = {} + index = cudf.RangeIndex(nrows) + else: + names = tbl_w_meta.column_names(include_children=False) + index_col = None + is_range_index = False + reset_index_name = False + range_idx = None + + if len(tbl_w_meta.per_file_user_data) > 0: + json_str = ( + tbl_w_meta.per_file_user_data[0] + .get(b"pandas", b"") + .decode("utf-8") + ) + if json_str != "": + meta = json.loads(json_str) + if ( + "index_columns" in meta + and len(meta["index_columns"]) > 0 + ): + index_col = meta["index_columns"] + if ( + isinstance(index_col[0], dict) + and index_col[0]["kind"] == "range" + ): + is_range_index = True + else: + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = ( + c["name"] or c["field_name"] + ) + if c["name"] is None: + reset_index_name = True + + actual_index_names = None + col_names = names + if index_col is not None and len(index_col) > 0: + if is_range_index: + range_index_meta = index_col[0] + range_idx = cudf.RangeIndex( + start=range_index_meta["start"], + stop=range_index_meta["stop"], + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + if skiprows != 0: + range_idx = range_idx[skiprows:] + if num_rows != -1: + range_idx = range_idx[:num_rows] + else: + actual_index_names = list(index_col_names.values()) + col_names = names[len(actual_index_names) :] + + data, index = data_from_pylibcudf_io( + tbl_w_meta, + col_names if columns is None else names, + actual_index_names, + ) + + if is_range_index: + index = range_idx + elif reset_index_name: + index.names = [None] * len(index.names) + + child_name_values = tbl_w_meta.child_names.values() + + data = { + name: ioutils._update_col_struct_field_names(col, child_names) + for (name, col), child_names in zip( + data.items(), child_name_values + ) + } + + return cudf.DataFrame._from_data(data, index=index) else: from pyarrow import orc - def read_orc_stripe(orc_file, stripe, columns): - pa_table = orc_file.read_stripe(stripe, columns) - if isinstance(pa_table, pa.RecordBatch): - pa_table = pa.Table.from_batches([pa_table]) - return pa_table - warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( @@ -364,11 +345,18 @@ def read_orc_stripe(orc_file, stripe, columns): orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: - pa_tables = [ - read_orc_stripe(orc_file, i, columns) + pa_tables = ( + orc_file.read_stripe(i, columns) for i in stripe_source_file - ] - pa_table = pa.concat_tables(pa_tables) + ) + pa_table = pa.concat_tables( + [ + pa.Table.from_batches([table]) + if isinstance(table, pa.RecordBatch) + else table + for table in pa_tables + ] + ) else: pa_table = orc_file.read(columns=columns) df = cudf.DataFrame.from_arrow(pa_table) @@ -378,16 +366,18 @@ def read_orc_stripe(orc_file, stripe, columns): @ioutils.doc_to_orc() def to_orc( - df, + df: cudf.DataFrame, fname, - compression="snappy", - statistics="ROWGROUP", - stripe_size_bytes=None, - stripe_size_rows=None, - row_index_stride=None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, cols_as_map_type=None, storage_options=None, - index=None, + index: bool | None = None, ): 
"""{docstring}""" @@ -413,7 +403,7 @@ def to_orc( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - liborc.write_orc( + _plc_write_orc( df, file_obj, compression, @@ -425,7 +415,7 @@ def to_orc( index, ) else: - liborc.write_orc( + _plc_write_orc( df, path_or_buf, compression, @@ -438,4 +428,279 @@ def to_orc( ) -ORCWriter = liborc.ORCWriter +@acquire_spill_lock() +def _plc_write_orc( + table: cudf.DataFrame, + path_or_buf, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + cols_as_map_type=None, + index: bool | None = None, +) -> None: + """ + See `cudf::io::write_orc`. + + See Also + -------- + cudf.read_orc + """ + user_data = {"pandas": ioutils.generate_pandas_metadata(table, index)} + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = ( + table._columns + if table.index is None + else itertools.chain(table.index._columns, table._columns) + ) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table._index.names): + tbl_meta.column_metadata[level].set_name( + ioutils._index_level_name(idx_name, level, table._column_names) # type: ignore[arg-type] + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + has_map_type = False + if cols_as_map_type is not None: + cols_as_map_type = set(cols_as_map_type) + has_map_type = True + + for i, (name, col) in enumerate( + table._column_labels_and_values, start=num_index_cols_meta + ): + tbl_meta.column_metadata[i].set_name(name) + _set_col_children_metadata( + col, + tbl_meta.column_metadata[i], + has_map_type and name in cols_as_map_type, + ) + + options = ( + plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(compression)) + .enable_statistics(_get_orc_stat_freq(statistics)) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + +class ORCWriter: + """ + ORCWriter lets you you incrementally write out a ORC file from a series + of cudf tables + + See Also + -------- + cudf.io.orc.to_orc + """ + + def __init__( + self, + path, + index: bool | None = None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + cols_as_map_type=None, + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + ): + self.sink = plc.io.SinkInfo([path]) + self.statistics = statistics + self.compression = compression + self.index = index + self.cols_as_map_type = ( + cols_as_map_type + if cols_as_map_type is None + else set(cols_as_map_type) + ) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = 
stripe_size_rows
+        self.row_index_stride = row_index_stride
+        self.initialized = False
+
+    def write_table(self, table):
+        """Writes a single table to the file"""
+        if not self.initialized:
+            self._initialize_chunked_state(table)
+
+        keep_index = self.index is not False and (
+            table.index.name is not None
+            or isinstance(table.index, cudf.MultiIndex)
+        )
+        if keep_index:
+            cols_to_write = itertools.chain(
+                table.index._columns, table._columns
+            )
+        else:
+            cols_to_write = table._columns
+
+        self.writer.write(
+            plc.Table([col.to_pylibcudf(mode="read") for col in cols_to_write])
+        )
+
+    def close(self):
+        if not self.initialized:
+            return
+        self.writer.close()
+
+    def _initialize_chunked_state(self, table):
+        """
+        Prepare all the values required to build the
+        chunked_orc_writer_options and create a writer
+        """
+
+        num_index_cols_meta = 0
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        if self.index is not False:
+            if isinstance(table.index, cudf.MultiIndex):
+                plc_table = plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(
+                            table.index._columns, table._columns
+                        )
+                    ]
+                )
+                self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                for level, idx_name in enumerate(table.index.names):
+                    self.tbl_meta.column_metadata[level].set_name(idx_name)
+                num_index_cols_meta = len(table.index.names)
+            else:
+                if table.index.name is not None:
+                    plc_table = plc.Table(
+                        [
+                            col.to_pylibcudf(mode="read")
+                            for col in itertools.chain(
+                                table.index._columns, table._columns
+                            )
+                        ]
+                    )
+                    self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                    self.tbl_meta.column_metadata[0].set_name(table.index.name)
+                    num_index_cols_meta = 1
+
+        has_map_type = self.cols_as_map_type is not None
+        for i, (name, col) in enumerate(
+            table._column_labels_and_values, start=num_index_cols_meta
+        ):
+            self.tbl_meta.column_metadata[i].set_name(name)
+            _set_col_children_metadata(
+                col,
+                self.tbl_meta.column_metadata[i],
+                has_map_type and name in self.cols_as_map_type,
+            )
+
+        user_data = {
+            "pandas": ioutils.generate_pandas_metadata(table, self.index)
+        }
+
+        options = (
+            plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink)
+            .metadata(self.tbl_meta)
+            .key_value_metadata(user_data)
+            .compression(_get_comp_type(self.compression))
+            .enable_statistics(_get_orc_stat_freq(self.statistics))
+            .build()
+        )
+        if self.stripe_size_bytes is not None:
+            options.set_stripe_size_bytes(self.stripe_size_bytes)
+        if self.stripe_size_rows is not None:
+            options.set_stripe_size_rows(self.stripe_size_rows)
+        if self.row_index_stride is not None:
+            options.set_row_index_stride(self.row_index_stride)
+
+        self.writer = plc.io.orc.OrcChunkedWriter.from_options(options)
+
+        self.initialized = True
+
+
+def _get_comp_type(
+    compression: Literal[False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"],
+) -> plc.io.types.CompressionType:
+    if compression is None or compression is False:
+        return plc.io.types.CompressionType.NONE
+
+    normed_compression = compression.upper()
+    if normed_compression == "SNAPPY":
+        return plc.io.types.CompressionType.SNAPPY
+    elif normed_compression == "ZLIB":
+        return plc.io.types.CompressionType.ZLIB
+    elif normed_compression == "ZSTD":
+        return plc.io.types.CompressionType.ZSTD
+    elif normed_compression == "LZ4":
+        return plc.io.types.CompressionType.LZ4
+    else:
+        raise ValueError(f"Unsupported `compression` type {compression}")
+
+
+def _get_orc_stat_freq(
+    statistics: Literal["NONE",
"STRIPE", "ROWGROUP"], +) -> plc.io.types.StatisticsFreq: + """ + Convert ORC statistics terms to CUDF convention: + - ORC "STRIPE" == CUDF "ROWGROUP" + - ORC "ROWGROUP" == CUDF "PAGE" + """ + normed_statistics = statistics.upper() + if normed_statistics == "NONE": + return plc.io.types.StatisticsFreq.STATISTICS_NONE + elif normed_statistics == "STRIPE": + return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP + elif normed_statistics == "ROWGROUP": + return plc.io.types.StatisticsFreq.STATISTICS_PAGE + else: + raise ValueError(f"Unsupported `statistics_freq` type {statistics}") + + +def _set_col_children_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + list_column_as_map: bool = False, +) -> None: + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_children_metadata( + child_col, col_meta.child(i), list_column_as_map + ) + elif isinstance(col.dtype, cudf.ListDtype): + if list_column_as_map: + col_meta.set_list_column_as_map() + _set_col_children_metadata( + col.children[1], col_meta.child(1), list_column_as_map + ) + else: + return diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl index 1ec077d10f7..64e06f0631d 100644 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 65947efc2df..c3c9a1c5338 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data["a"]._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 68f2aaf9cab..b50ed04427f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import cudf from cudf.testing import _utils as utils, assert_eq @@ -149,13 +150,19 @@ def test_serialize(df, to_host): def test_serialize_dtype_error_checking(): dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + # Must call device_serialize (not serialize) to ensure that the type metadata is + # 
encoded in the header. + header, frames = dtype.device_serialize() with pytest.raises(AssertionError): # mismatching class cudf.StructDtype.deserialize(header, frames) + # The is-cuda flag list length must match the number of frames + header["is-cuda"] = [False] + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize( + header, [np.zeros(1)] * (header["frame_count"] + 1) + ) def test_serialize_dataframe(): @@ -382,6 +389,10 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got +@pytest.mark.skipif( + version.parse(np.__version__) < version.parse("2.0.0"), + reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x", +) def test_deserialize_cudf_23_12(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 99bd9adb034..f8697c5c6b8 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -772,6 +772,69 @@ def test_round_nan_as_null_false(series, decimal): assert_eq(result, expected, atol=1e-10) +@pytest.mark.parametrize( + "data, dtype, decimals, expected_half_up, expected_half_even", + [ + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.34, 3.46], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 0, + [1.0, 2.0, 3.0], + [1.0, 2.0, 3.0], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 3, + [1.234, 2.345, 3.456], + [1.234, 2.345, 3.456], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 4, + [1.2346, 2.3457, 3.4568], + [1.2346, 2.3457, 3.4568], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.35, 3.46], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 6, + [1.234567, 2.345678, 3.456789], + [1.234567, 2.345678, 3.456789], + ), + ], +) +def test_series_round_decimal( + data, dtype, decimals, expected_half_up, expected_half_even +): + ser = cudf.Series(data).astype(dtype) + + result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype) + expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype) + assert_eq(result_half_up, expected_ser_half_up) + + result_half_even = ser.round(decimals=decimals, how="half_even").astype( + dtype + ) + expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype) + assert_eq(result_half_even, expected_ser_half_even) + + @pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_isnull_isna(ps, nan_as_null): diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 899d78c999b..b85943626a6 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -79,7 +79,7 @@ def test_series_construction_with_nulls(): ) def test_serialize_struct_dtype(fields): dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) assert recreated == dtype diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 3637ef075f2..9a62285403f 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ 
b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,7 +882,7 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash_permuted(): +def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) params = cudf.Series([1, 2, 3], dtype=np.uint32) @@ -894,7 +894,7 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash(0, a=params, b=params, width=5) assert_eq(expected, actual) params = cudf.Series([1, 2, 3], dtype=np.uint64) @@ -912,78 +912,18 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash64(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash_permuted(1, a="a", b="b", width=7) + strings.str.minhash(1, a="a", b="b", width=7) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash_permuted(1, a=params, b=params, width=6) + strings.str.minhash(1, a=params, b=params, width=6) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64_permuted(1, a=params, b=params, width=8) - - -def test_word_minhash(): - ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), - cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash(seeds=seeds) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([2603139454418834912], dtype=np.uint64), - cudf.Series([5240044617220523711], dtype=np.uint64), - ] - ) - actual = ls.str.word_minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [ - 2603139454418834912, - 8644371945174847701, - 5541030711534384340, - ], - dtype=np.uint64, - ), - cudf.Series( - [5240044617220523711, 5847101123925041457, 153762819128779913], - dtype=np.uint64, - ), - ] - ) - actual = ls.str.word_minhash64(seeds=seeds) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - ls.str.word_minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - ls.str.word_minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - ls.str.word_minhash64(seeds=seeds) + strings.str.minhash64(1, a=params, b=params, width=8) def test_jaccard_index(): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5681601d2be..d9a3da6666d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -3,37 +3,45 @@ import datetime import functools +import json import operator import os import urllib import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import fsspec import fsspec.implementations.local import numpy as np import pandas as pd +import 
pyarrow as pa from fsspec.core import expand_paths_if_needed, get_fs_token_paths import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial +from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype try: import fsspec.parquet as fsspec_parquet - except ImportError: fsspec_parquet = None + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable from cudf.core.column import ColumnBase +PARQUET_META_TYPE_MAP = { + str(cudf_dtype): str(pandas_dtype) + for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() +} + _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max @@ -1487,6 +1495,154 @@ ) +def _index_level_name( + index_name: Hashable, level: int, column_names: list[Hashable] +) -> Hashable: + """ + Return the name of an index level or a default name + if `index_name` is None or is already a column name. + + Parameters + ---------- + index_name : name of an Index object + level : level of the Index object + column_names : list of the existing column names + + Returns + ------- + name : Hashable + """ + if index_name is not None and index_name not in column_names: + return index_name + else: + return f"__index_level_{level}__" + + +def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str: + col_names: list[Hashable] = [] + types = [] + index_levels = [] + index_descriptors = [] + columns_to_convert = list(table._columns) + # Columns + for name, col in table._column_labels_and_values: + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. + col_names.append(str(name)) + else: + col_names.append(name) + + if isinstance(col.dtype, cudf.CategoricalDtype): + raise ValueError( + "'category' column dtypes are currently not " + + "supported by the gpu accelerated parquet writer" + ) + elif isinstance( + col.dtype, + (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype), + ): + types.append(col.dtype.to_arrow()) + else: + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # with arrow, we use `int8` type when converting from a + # cudf boolean array. + if col.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(col.dtype)) + + # Indexes + materialize_index = False + if index is not False: + for level, name in enumerate(table.index.names): + if isinstance(table.index, cudf.MultiIndex): + idx = table.index.get_level_values(level) + else: + idx = table.index + + if isinstance(idx, cudf.RangeIndex): + if index is None: + descr: dict[str, Any] | Hashable = { + "kind": "range", + "name": table.index.name, + "start": table.index.start, + "stop": table.index.stop, + "step": table.index.step, + } + else: + materialize_index = True + # When `index=True`, RangeIndex needs to be materialized. 
+ materialized_idx = idx._as_int_index() + descr = _index_level_name( + index_name=materialized_idx.name, + level=level, + column_names=col_names, + ) + index_levels.append(materialized_idx) + columns_to_convert.append(materialized_idx._values) + col_names.append(descr) + types.append(np_to_pa_dtype(materialized_idx.dtype)) + else: + descr = _index_level_name( + index_name=idx.name, level=level, column_names=col_names + ) + columns_to_convert.append(idx._values) + col_names.append(descr) + if isinstance(idx.dtype, cudf.CategoricalDtype): + raise ValueError( + "'category' column dtypes are currently not " + + "supported by the gpu accelerated parquet writer" + ) + elif isinstance(idx.dtype, cudf.ListDtype): + types.append(idx.dtype.to_arrow()) + else: + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interoperable + # with arrow, we use `int8` type when converting from a + # cudf boolean array. + if idx.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + + index_levels.append(idx) + index_descriptors.append(descr) + + df_meta = table.head(0) + if materialize_index: + df_meta.index = df_meta.index._as_int_index() + metadata = pa.pandas_compat.construct_metadata( + columns_to_convert=columns_to_convert, + # It is OKAY to do `.head(0).to_pandas()` because + # this method will extract `.columns` metadata only + df=df_meta.to_pandas(), + column_names=col_names, + index_levels=index_levels, + index_descriptors=index_descriptors, + preserve_index=index, + types=types, + ) + + md_dict = json.loads(metadata[b"pandas"]) + + # correct metadata for list, struct, and nullable numeric types + for col_meta in md_dict["columns"]: + if ( + col_meta["name"] in table._column_names + and table._data[col_meta["name"]].nullable + and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP + and col_meta["pandas_type"] != "decimal" + ): + col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[ + col_meta["numpy_type"] + ] + if col_meta["numpy_type"] in ("list", "struct"): + col_meta["numpy_type"] = "object" + + return json.dumps(md_dict) + + def is_url(url): """Check if a string is a valid URL to a network location. diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 80de9056a0a..21c18ef0174 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1faa778ccf6..b5af3bb80bf 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -517,17 +517,22 @@ def do_evaluate( elif typ == "parquet": parquet_options = config_options.get("parquet_options", {}) if parquet_options.get("chunked", True): + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + # We handle skip_rows != 0 by reading + # up to n_rows + skip_rows rows and slicing off + # the first skip_rows entries. 
+ # TODO: Remove this workaround once + # https://github.com/rapidsai/cudf/issues/16186 + # is fixed + nrows = n_rows + skip_rows + if nrows > -1: + options.set_num_rows(nrows) + if with_columns is not None: + options.set_columns(with_columns) reader = plc.io.parquet.ChunkedParquetReader( - plc.io.SourceInfo(paths), - columns=with_columns, - # We handle skip_rows != 0 by reading from the - # up to n_rows + skip_rows and slicing off the - # first skip_rows entries. - # TODO: Remove this workaround once - # https://github.com/rapidsai/cudf/issues/16186 - # is fixed - nrows=n_rows + skip_rows, - skip_rows=0, + options, chunk_read_limit=parquet_options.get( "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE ), @@ -573,13 +578,18 @@ def slice_skip(tbl: plc.Table): if predicate is not None and row_index is None: # Can't apply filters during read if we have a row index. filters = to_parquet_filter(predicate.value) - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(paths), - columns=with_columns, - filters=filters, - nrows=n_rows, - skip_rows=skip_rows, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + if n_rows != -1: + options.set_num_rows(n_rows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if with_columns is not None: + options.set_columns(with_columns) + if filters is not None: + options.set_filter(filters) + tbl_w_meta = plc.io.parquet.read_parquet(options) df = DataFrame.from_table( tbl_w_meta.tbl, # TODO: consider nested column names? diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 2dc4031b876..5192e6b8171 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -163,6 +163,11 @@ def read_text(*args, **kwargs): return legacy_read_text(*args, **kwargs) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): @@ -182,6 +187,11 @@ def struct(self): return StructMethods(self) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cda7e2d134d..7101fb7e00a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1019,3 +1019,29 @@ def test_rename_axis_after_join(): result = ddf1.join(ddf2, how="outer") expected = df1.join(df2, how="outer") dd.assert_eq(result, expected, check_index=False) + + +def test_clip_dataframe(): + df = cudf.DataFrame( + { + "id": ["a", "b", "c", "d"], + "score": [-1, 1, 4, 6], + } + ) + expect = df.clip(lower=["b", 1], upper=["d", 5], axis=1) + got = dd.from_pandas(df, npartitions=2).clip( + lower=["b", 1], upper=["d", 5], axis=1 + ) + dd.assert_eq(expect, got) + + +def test_clip_series(): + ser = cudf.Series([-0.5, 0.5, 4.5, 5.5]) + expect = ser.clip(lower=0, upper=5).round().astype(int) + got = ( + dd.from_pandas(ser, npartitions=2) + .clip(lower=0, upper=5) + .round() + .astype(int) + ) + dd.assert_eq(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py 
b/python/dask_cudf/dask_cudf/tests/test_distributed.py index d03180852eb..c28b7e49207 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -4,7 +4,7 @@ import pytest import dask -from dask import dataframe as dd +from dask import array as da, dataframe as dd from dask.distributed import Client from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 @@ -121,3 +121,17 @@ def test_unique(): ddf.x.unique().compute(), check_index=False, ) + + +def test_serialization_of_numpy_types(): + # Dask uses numpy integers as column names, which can break cudf serialization + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + with dask.config.set( + {"dataframe.backend": "cudf", "array.backend": "cupy"} + ): + rng = da.random.default_rng() + X_arr = rng.random((100, 10), chunks=(50, 10)) + X = dd.from_dask_array(X_arr) + X = X[X.columns[0]] + X.compute() diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 7bd6ba91ca9..84f47cf5305 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_writer_options, parquet_writer_options_builder, + parquet_reader_options, + parquet_reader_options_builder, chunked_parquet_writer_options, chunked_parquet_writer_options_builder, ) @@ -27,6 +29,25 @@ from pylibcudf.table cimport Table from pylibcudf.types cimport DataType +cdef class ParquetReaderOptions: + cdef parquet_reader_options c_obj + cdef SourceInfo source + cpdef void set_row_groups(self, list row_groups) + cpdef void set_num_rows(self, size_type nrows) + cpdef void set_skip_rows(self, int64_t skip_rows) + cpdef void set_columns(self, list col_names) + cpdef void set_filter(self, Expression filter) + +cdef class ParquetReaderOptionsBuilder: + cdef parquet_reader_options_builder c_obj + cdef SourceInfo source + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val) + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val) + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val) + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val) + cpdef build(self) + + cdef class ChunkedParquetReader: cdef unique_ptr[cpp_chunked_parquet_reader] reader @@ -34,20 +55,7 @@ cdef class ChunkedParquetReader: cpdef TableWithMetadata read_chunk(self) -cpdef read_parquet( - SourceInfo source_info, - list columns = *, - list row_groups = *, - Expression filters = *, - bool convert_strings_to_categories = *, - bool use_pandas_metadata = *, - int64_t skip_rows = *, - size_type nrows = *, - bool allow_mismatched_pq_schemas = *, - # disabled see comment in parquet.pyx for more - # ReaderColumnSchema reader_column_schema = *, - # DataType timestamp_type = * -) +cpdef read_parquet(ParquetReaderOptions options) cdef class ParquetChunkedWriter: diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index 22bea1abd8e..2d8d12c1a45 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from collections.abc import Mapping -from typing import Self + +from typing_extensions import Self from pylibcudf.expressions import Expression from pylibcudf.io.types import ( @@ -16,6 +17,24 @@ from pylibcudf.io.types import ( ) from pylibcudf.table import Table +class ParquetReaderOptions: + def __init__(self): ... + def set_row_groups(self, row_groups: list[list[int]]): ... + def set_num_rows(self, nrows: int): ... + def set_skip_rows(self, skip_rows: int): ... + def set_columns(self, col_names: list[str]): ... + def set_filter(self, filter: Expression): ... + @staticmethod + def builder(source: SourceInfo) -> ParquetReaderOptionsBuilder: ... + +class ParquetReaderOptionsBuilder: + def __init__(self): ... + def convert_strings_to_categories(self, val: bool) -> Self: ... + def use_pandas_metadata(self, val: bool) -> Self: ... + def allow_mismatched_pq_schemas(self, val: bool) -> Self: ... + def use_arrow_schema(self, val: bool) -> Self: ... + def build(self) -> ParquetReaderOptions: ... + class ChunkedParquetReader: def __init__( self, diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 9bdf849a30c..672fe2be847 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -42,47 +42,204 @@ __all__ = [ "ParquetWriterOptionsBuilder", "read_parquet", "write_parquet", + "ParquetReaderOptions", + "ParquetReaderOptionsBuilder", "ChunkedParquetWriterOptions", "ChunkedParquetWriterOptionsBuilder" "merge_row_group_metadata", ] -cdef parquet_reader_options _setup_parquet_reader_options( - SourceInfo source_info, - list columns = None, - list row_groups = None, - Expression filters = None, - bool convert_strings_to_categories = False, - bool use_pandas_metadata = True, - int64_t skip_rows = 0, - size_type nrows = -1, - bool allow_mismatched_pq_schemas=False, - # ReaderColumnSchema reader_column_schema = None, - # DataType timestamp_type = DataType(type_id.EMPTY) -): - cdef vector[string] col_vec - cdef parquet_reader_options opts = ( - parquet_reader_options.builder(source_info.c_obj) - .convert_strings_to_categories(convert_strings_to_categories) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .use_arrow_schema(True) - .build() - ) - if row_groups is not None: - opts.set_row_groups(row_groups) - if nrows != -1: - opts.set_num_rows(nrows) - if skip_rows != 0: - opts.set_skip_rows(skip_rows) - if columns is not None: - col_vec.reserve(len(columns)) - for col in columns: - col_vec.push_back(str(col).encode()) - opts.set_columns(col_vec) - if filters is not None: - opts.set_filter(dereference(filters.c_obj.get())) - return opts + +cdef class ParquetReaderOptions: + """The settings to use for ``read_parquet`` For details, see :cpp:class:`cudf::io::parquet_reader_options` """ + @staticmethod + def builder(SourceInfo source): + """ + Create a ParquetReaderOptionsBuilder object + + For details, see :cpp:func:`cudf::io::parquet_reader_options::builder` + + Parameters + ---------- + source : SourceInfo + The source to read the Parquet file from. 
+ + Returns + ------- + ParquetReaderOptionsBuilder + Builder to build ParquetReaderOptions + """ + cdef ParquetReaderOptionsBuilder parquet_builder = ( + ParquetReaderOptionsBuilder.__new__(ParquetReaderOptionsBuilder) + ) + parquet_builder.c_obj = parquet_reader_options.builder(source.c_obj) + parquet_builder.source = source + return parquet_builder + + cpdef void set_row_groups(self, list row_groups): + """ + Sets the list of individual row groups to read. + + Parameters + ---------- + row_groups : list + List of row groups to read + + Returns + ------- + None + """ + cdef vector[vector[size_type]] outer + cdef vector[size_type] inner + for row_group in row_groups: + for x in row_group: + inner.push_back(x) + outer.push_back(inner) + inner.clear() + + self.c_obj.set_row_groups(outer) + + cpdef void set_num_rows(self, size_type nrows): + """ + Sets the number of rows to read. + + Parameters + ---------- + nrows : size_type + Number of rows to read after skipping + + Returns + ------- + None + """ + self.c_obj.set_num_rows(nrows) + + cpdef void set_skip_rows(self, int64_t skip_rows): + """ + Sets the number of rows to skip. + + Parameters + ---------- + skip_rows : int64_t + Number of rows to skip from the start + + Returns + ------- + None + """ + self.c_obj.set_skip_rows(skip_rows) + + cpdef void set_columns(self, list col_names): + """ + Sets the names of the columns to be read. + + Parameters + ---------- + col_names : list + List of column names + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.set_columns(vec) + + cpdef void set_filter(self, Expression filter): + """ + Sets an AST-based filter for predicate pushdown. + + Parameters + ---------- + filter : Expression + AST expression to use as filter + + Returns + ------- + None + """ + self.c_obj.set_filter(dereference(filter.c_obj.get())) + + +cdef class ParquetReaderOptionsBuilder: + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val): + """ + Enables/disables conversion of string columns to categories. + + Parameters + ---------- + val : bool + Boolean value to enable/disable conversion of string columns to categories + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.convert_strings_to_categories(val) + return self + + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val): + """ + Enables/disables use of pandas metadata when reading. + + Parameters + ---------- + val : bool + Boolean value whether to use pandas metadata + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_pandas_metadata(val) + return self + + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val): + """ + Enables/disables reading of matching projected and filter + columns from mismatched Parquet sources. + + Parameters + ---------- + val : bool + Boolean value whether to read matching projected and filter + columns from mismatched Parquet sources. + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.allow_mismatched_pq_schemas(val) + return self + + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val): + """ + Enables/disables use of the arrow schema when reading. 
+ + Parameters + ---------- + val : bool + Boolean value whether to use arrow schema + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_arrow_schema(val) + return self + + cpdef build(self): + """Create a ParquetReaderOptions object""" + cdef ParquetReaderOptions parquet_options = ParquetReaderOptions.__new__( + ParquetReaderOptions + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.source = self.source + return parquet_options cdef class ChunkedParquetReader: @@ -93,63 +250,27 @@ cdef class ChunkedParquetReader: Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The names of the columns to be read - row_groups : list[list[size_type]], default None - List of row groups to be read. - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. + options : ParquetReaderOptions + Settings for controlling reading behavior chunk_read_limit : size_t, default 0 Limit on total number of bytes to be returned per read, or 0 if there is no limit. pass_read_limit : size_t, default 1024000000 Limit on the amount of memory used for reading and decompressing data or 0 if there is no limit. - allow_mismatched_pq_schemas : bool, default False - Whether to read (matching) columns specified in `columns` from - the input files with otherwise mismatched schemas. """ def __init__( self, - SourceInfo source_info, - list columns=None, - list row_groups=None, - bool use_pandas_metadata=True, - bool convert_strings_to_categories=False, - int64_t skip_rows = 0, - size_type nrows = -1, + ParquetReaderOptions options, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000, - bool allow_mismatched_pq_schemas=False ): - - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters=None, - convert_strings_to_categories=convert_strings_to_categories, - use_pandas_metadata=use_pandas_metadata, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - with nogil: self.reader.reset( new cpp_chunked_parquet_reader( chunk_read_limit, pass_read_limit, - opts + options.c_obj, ) ) @@ -184,69 +305,23 @@ cdef class ChunkedParquetReader: return TableWithMetadata.from_libcudf(c_result) -cpdef read_parquet( - SourceInfo source_info, - list columns = None, - list row_groups = None, - Expression filters = None, - bool convert_strings_to_categories = False, - bool use_pandas_metadata = True, - int64_t skip_rows = 0, - size_type nrows = -1, - bool allow_mismatched_pq_schemas = False, - # Disabled, these aren't used by cudf-python - # we should only add them back in if there's user demand - # ReaderColumnSchema reader_column_schema = None, - # DataType timestamp_type = DataType(type_id.EMPTY) -): - """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + +cpdef read_parquet(ParquetReaderOptions options): + """ + Read from Parquet format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_parquet`. 
Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The string names of the columns to be read. - row_groups : list[list[size_type]], default None - List of row groups to be read. - filters : Expression, default None - An AST :py:class:`pylibcudf.expressions.Expression` - to use for predicate pushdown. - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. - allow_mismatched_pq_schemas : bool, default False - If True, enable reading (matching) columns specified in `columns` - from the input files with otherwise mismatched schemas. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: ParquetReaderOptions + Settings for controlling reading behavior """ - cdef table_with_metadata c_result - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters, - convert_strings_to_categories, - use_pandas_metadata, - skip_rows, - nrows, - allow_mismatched_pq_schemas, - ) - with nogil: - c_result = move(cpp_read_parquet(opts)) + c_result = move(cpp_read_parquet(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index a1f3b17936c..61fe33d6805 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -65,7 +65,6 @@ cdef class ColumnInMetadata: cdef class TableInputMetadata: cdef table_input_metadata c_obj - cdef list column_metadata cdef class TableWithMetadata: cdef public Table tbl diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index a3a559219ff..63fa9d1ff79 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -64,6 +64,8 @@ class PartitionInfo: class TableInputMetadata: def __init__(self, table: Table): ... + @property + def column_metadata(self) -> list[ColumnInMetadata]: ... class ColumnInMetadata: def set_name(self, name: str) -> Self: ... diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index a2155829f2c..458595ca0e0 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -288,12 +288,14 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) - self.column_metadata = [ + + @property + def column_metadata(self): + return [ ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) for i in range(self.c_obj.column_metadata.size()) ] - cdef class TableWithMetadata: """A container holding a table and its associated metadata (e.g. 
column names) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 8570531dfde..9d1e8cba425 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -11,18 +11,6 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( - const column_view &strings, - const numeric_scalar[uint32_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash_permuted( const column_view &strings, const uint32_t seed, const column_view &a, @@ -31,31 +19,9 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: ) except + cdef unique_ptr[column] minhash64( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64( - const column_view &strings, - const numeric_scalar[uint64_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64_permuted( const column_view &strings, const uint64_t seed, const column_view &a, const column_view &b, const size_type width, ) except + - - cdef unique_ptr[column] word_minhash( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler - - cdef unique_ptr[column] word_minhash64( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 6b544282f44..0af53748cdc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -9,9 +9,7 @@ ctypedef fused ColumnOrScalar: Column Scalar -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -19,16 +17,10 @@ cpdef Column minhash_permuted( size_type width ) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, Column b, size_type width ) - -cpdef Column word_minhash(Column input, Column seeds) - -cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index a2d9b6364f7..5d88cfbbea0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,13 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.column import Column -from pylibcudf.scalar import Scalar def minhash( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... def minhash64( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... -def word_minhash(input: Column, seeds: Column) -> Column: ... -def word_minhash64(input: Column, seeds: Column) -> Column: ... 
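Note on the renamed nvtext API: the former minhash_permuted/minhash64_permuted entry points are now simply minhash/minhash64, taking an explicit seed plus permutation columns a and b, and the word_minhash variants are removed. A minimal usage sketch, distilled from the updated tests above (the input strings and parameter values are illustrative only):

    import numpy as np
    import cudf

    strings = cudf.Series(["this is my", "favorite book"])
    params = cudf.Series([1, 2, 3], dtype=np.uint32)
    # One uint32 minhash value per (a, b) pair, computed over 5-character
    # substrings; pass uint64 parameters to minhash64 for the 64-bit variant.
    result = strings.str.minhash(0, a=params, b=params, width=5)
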
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5448cc6de9b..84811cda867 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,69 +8,15 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, - minhash64_permuted as cpp_minhash64_permuted, - minhash_permuted as cpp_minhash_permuted, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, ) -from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.scalar cimport Scalar - -from cython.operator import dereference -import warnings __all__ = [ "minhash", "minhash64", - "word_minhash", - "word_minhash64", ] -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`minhash`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -81,7 +27,7 @@ cpdef Column minhash_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`minhash_permuted`. + For details, see :cpp:func:`minhash`. Parameters ---------- @@ -104,7 +50,7 @@ cpdef Column minhash_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash_permuted( + c_result = cpp_minhash( input.view(), seed, a.view(), @@ -114,50 +60,7 @@ cpdef Column minhash_permuted( return Column.from_libcudf(move(c_result)) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm. - - For details, see :cpp:func:`minhash64`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. 
- - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash64_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash64( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, @@ -168,7 +71,7 @@ cpdef Column minhash64_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`minhash64_permuted`. + For details, see :cpp:func:`minhash64`. Parameters ---------- @@ -191,7 +94,7 @@ cpdef Column minhash64_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash64_permuted( + c_result = cpp_minhash64( input.view(), seed, a.view(), @@ -200,62 +103,3 @@ cpdef Column minhash64_permuted( ) return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`word_minhash`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash64(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm though - only the first 64-bits of the hash are used in computing the output. - - For details, see :cpp:func:`word_minhash64`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. 
- - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash64( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index 94524acbcc8..da535809745 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -31,19 +31,24 @@ def test_read_parquet_basic( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - res = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), - nrows=nrows, - skip_rows=skiprows, - columns=columns, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + if nrows > -1: + options.set_num_rows(nrows) + if skiprows != 0: + options.set_skip_rows(skiprows) + if columns is not None: + options.set_columns(columns) + + res = plc.io.parquet.read_parquet(options) if columns is not None: pa_table = pa_table.select(columns) # Adapt to nrows/skiprows pa_table = pa_table.slice( - offset=skiprows, length=nrows if nrows != -1 else None + offset=skiprows, length=nrows if nrows > -1 else None ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) @@ -95,9 +100,12 @@ def test_read_parquet_filters( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - plc_table_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), filters=plc_filters - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_filter(plc_filters) + + plc_table_w_meta = plc.io.parquet.read_parquet(options) exp = read_table(source, filters=pa_filters) assert_table_and_meta_eq( exp, plc_table_w_meta, check_field_nullability=False diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ec533e64307..ad7a6f7a762 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -13,20 +13,13 @@ def minhash_input_data(request): return input_arr, seeds, request.param -@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) -def word_minhash_input_data(request): - input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) - seeds = pa.array([2, 3, 4, 5], request.param) - return input_arr, seeds, request.param - - @pytest.mark.parametrize("width", [5, 12]) -def test_minhash_permuted(minhash_input_data, width): +def test_minhash(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash_permuted + plc.nvtext.minhash.minhash if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64_permuted + else plc.nvtext.minhash.minhash64 ) result = minhash_func( plc.interop.from_arrow(input_arr), @@ -40,20 +33,3 @@ def test_minhash_permuted(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) - - -def test_word_minhash(word_minhash_input_data): - input_arr, seeds, seed_type = word_minhash_input_data - word_minhash_func = ( - plc.nvtext.minhash.word_minhash - if seed_type == pa.uint32() - else plc.nvtext.minhash.word_minhash64 - ) - result = word_minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) - ) - pa_result = plc.interop.to_arrow(result) 
- assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) - assert pa_result.type == pa.list_( - pa.field("element", seed_type, nullable=False) - ) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a5e5704b8ed..53ee3e2b56e 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging",
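
Note on the new reader options flow: every pylibcudf parquet read now goes through a ParquetReaderOptions object built from a SourceInfo, with setters applied only when a non-default value is requested. A minimal sketch of the call pattern, mirroring the updated tests and cudf-polars usage above (the file path and column names are placeholders):

    import pylibcudf as plc

    options = plc.io.parquet.ParquetReaderOptions.builder(
        plc.io.SourceInfo(["example.parquet"])  # placeholder path
    ).build()
    options.set_num_rows(100)        # read at most 100 rows...
    options.set_skip_rows(10)        # ...after skipping the first 10
    options.set_columns(["a", "b"])  # placeholder column names
    tbl_w_meta = plc.io.parquet.read_parquet(options)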