From 92652be87839e4a4e49216c49bd36860674bff6a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:17:28 -0800 Subject: [PATCH 1/4] Remove cudf._lib.parquet in favor of inlining pylibcudf (#17562) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17562 --- python/cudf/cudf/_lib/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/io/CMakeLists.txt | 21 - python/cudf/cudf/_lib/io/__init__.pxd | 0 python/cudf/cudf/_lib/io/__init__.py | 0 python/cudf/cudf/_lib/io/utils.pxd | 31 - python/cudf/cudf/_lib/io/utils.pyx | 74 -- python/cudf/cudf/_lib/parquet.pyx | 817 ------------------- python/cudf/cudf/io/parquet.py | 992 +++++++++++++++++++++--- python/cudf/cudf/tests/test_parquet.py | 72 +- python/cudf/cudf/utils/ioutils.py | 1 - 11 files changed, 941 insertions(+), 1073 deletions(-) delete mode 100644 python/cudf/cudf/_lib/io/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/io/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/io/__init__.py delete mode 100644 python/cudf/cudf/_lib/io/utils.pxd delete mode 100644 python/cudf/cudf/_lib/io/utils.pyx delete mode 100644 python/cudf/cudf/_lib/parquet.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index efe96ff6c3e..f422635d22a 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx - sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx + stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) @@ -31,5 +31,4 @@ include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) -add_subdirectory(io) add_subdirectory(nvtext) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 52e9b89da7b..cfdcec4cd3b 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, nvtext, - parquet, reduce, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt deleted file mode 100644 index e7408cf2852..00000000000 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources utils.pyx) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd deleted file mode 100644 index 9b8bab012e2..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - - -cdef add_df_col_struct_names( - df, - child_names_dict -) -cdef update_col_struct_field_names( - Column col, - child_names -) -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -) -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx deleted file mode 100644 index df4675be599..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - - -from libcpp.string cimport string - -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.types cimport column_name_info - -from cudf._lib.column cimport Column - -from cudf.core.dtypes import StructDtype - -cdef add_df_col_struct_names(df, child_names_dict): - for name, child_names in child_names_dict.items(): - col = df._data[name] - - df._data[name] = update_col_struct_field_names(col, child_names) - - -cdef update_col_struct_field_names(Column col, child_names): - if col.children: - children = list(col.children) - for i, (child, names) in enumerate(zip(children, child_names.values())): - children[i] = update_col_struct_field_names( - child, - names - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - col = col._rename_fields( - child_names.keys() - ) - - return col - - -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -): - # Deprecated, remove in favor of add_col_struct_names - # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._column_labels_and_values): - table._data[name] = update_column_struct_field_names( - col, schema_info[i] - ) - - -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -): - cdef vector[string] field_names - - if col.children: - children = list(col.children) - for i, child in enumerate(children): - children[i] = update_column_struct_field_names( - child, - info.children[i] - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - field_names.reserve(len(col.base_children)) - for i in range(info.children.size()): - field_names.push_back(info.children[i].name) - col = col._rename_fields( - field_names - ) - - return col diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx deleted file mode 100644 index 
00c434ae374..00000000000 --- a/python/cudf/cudf/_lib/parquet.pyx +++ /dev/null @@ -1,817 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import io - -import pyarrow as pa -import itertools -import cudf -from cudf.core.buffer import acquire_spill_lock - -try: - import ujson as json -except ImportError: - import json - -import numpy as np - -from cudf.api.types import is_list_like - -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -from cudf._lib.utils import _index_level_name, generate_pandas_metadata - -from libc.stdint cimport int64_t -from libcpp cimport bool - -from pylibcudf.expressions cimport Expression -from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.types cimport ( - statistics_freq, - compression_type, - dictionary_policy, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, -) - -import pylibcudf as plc - -from pylibcudf cimport Table - -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.parquet cimport ParquetChunkedWriter - - -def _parse_metadata(meta): - file_is_range_index = False - file_index_cols = None - file_column_dtype = None - - if 'index_columns' in meta and len(meta['index_columns']) > 0: - file_index_cols = meta['index_columns'] - - if isinstance(file_index_cols[0], dict) and \ - file_index_cols[0]['kind'] == 'range': - file_is_range_index = True - if 'column_indexes' in meta and len(meta['column_indexes']) == 1: - file_column_dtype = meta['column_indexes'][0]["numpy_type"] - return file_is_range_index, file_index_cols, file_column_dtype - - -cdef object _process_metadata(object df, - list names, - dict child_names, - list per_file_user_data, - object row_groups, - object filepaths_or_buffers, - bool allow_range_index, - bool use_pandas_metadata, - size_type nrows=-1, - int64_t skip_rows=0, - ): - - add_df_col_struct_names(df, child_names) - index_col = None - is_range_index = True - column_index_type = None - index_col_names = None - meta = None - for single_file in per_file_user_data: - if b'pandas' not in single_file: - continue - json_str = single_file[b'pandas'].decode('utf-8') - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - if meta is not None: - # Book keep each column metadata as the order - # of `meta["columns"]` and `column_names` are not - # guaranteed to be deterministic and same always. 
- meta_data_per_column = { - col_meta['name']: col_meta for col_meta in meta["columns"] - } - - # update the decimal precision of each column - for col in names: - if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): - df._data[col].dtype.precision = ( - meta_data_per_column[col]["metadata"]["precision"] - ) - - # Set the index column - if index_col is not None and len(index_col) > 0: - if is_range_index: - if not allow_range_index: - return df - - if len(per_file_user_data) > 1: - range_index_meta = { - "kind": "range", - "name": None, - "start": 0, - "stop": len(df), - "step": 1 - } - else: - range_index_meta = index_col[0] - - if row_groups is not None: - per_file_metadata = [ - pa.parquet.read_metadata( - # Pyarrow cannot read directly from bytes - io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in filepaths_or_buffers - ] - - filtered_idx = [] - for i, file_meta in enumerate(per_file_metadata): - row_groups_i = [] - start = 0 - for row_group in range(file_meta.num_row_groups): - stop = start + file_meta.row_group(row_group).num_rows - row_groups_i.append((start, stop)) - start = stop - - for rg in row_groups[i]: - filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) - ) - - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) - else: - idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) - else: - start = range_index_meta["start"] + skip_rows - stop = range_index_meta["stop"] - if nrows > -1: - stop = start + nrows - idx = cudf.RangeIndex( - start=start, - stop=stop, - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - - df._index = idx - elif set(index_col).issubset(names): - index_data = df[index_col] - actual_index_names = iter(index_col_names.values()) - if index_data._num_columns == 1: - idx = cudf.Index._from_column( - index_data._columns[0], - name=next(actual_index_names) - ) - else: - idx = cudf.MultiIndex.from_frame( - index_data, - names=list(actual_index_names) - ) - df.drop(columns=index_col, inplace=True) - df._index = idx - else: - if use_pandas_metadata: - df.index.names = index_col - - if df._num_columns == 0 and column_index_type is not None: - df._data.label_dtype = cudf.dtype(column_index_type) - - return df - - -def read_parquet_chunked( - filepaths_or_buffers, - columns=None, - row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False -): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - - reader = ChunkedParquetReader( - options, - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - ) - - tbl_w_meta = reader.read_chunk() - column_names = tbl_w_meta.column_names(include_children=False) - child_names = tbl_w_meta.child_names - per_file_user_data = tbl_w_meta.per_file_user_data - 
concatenated_columns = tbl_w_meta.tbl.columns() - - # save memory - del tbl_w_meta - - cdef Table tbl - while reader.has_next(): - tbl = reader.read_chunk().tbl - - for i in range(tbl.num_columns()): - concatenated_columns[i] = plc.concatenate.concatenate( - [concatenated_columns[i], tbl._columns[i]] - ) - # Drop residual columns to save memory - tbl._columns[i] = None - - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], - column_names=column_names, - index_names=None - ) - ) - df = _process_metadata(df, column_names, child_names, - per_file_user_data, row_groups, - filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - - -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - allow_range_index = True - if columns is not None and len(columns) == 0 or filters: - allow_range_index = False - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - if filters is not None: - options.set_filter(filters) - - tbl_w_meta = plc.io.parquet.read_parquet(options) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(tbl_w_meta) - ) - - df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), - tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - -cpdef read_parquet_metadata(list filepaths_or_buffers): - """ - Cython function to call into libcudf API, see `read_parquet_metadata`. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( - plc.io.SourceInfo(filepaths_or_buffers) - ) - - # read all column names including index column, if any - col_names = [info.name() for info in parquet_metadata.schema().root().children()] - - index_col_names = set() - json_str = parquet_metadata.metadata()['pandas'] - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, _ = _parse_metadata(meta) - if ( - not file_is_range_index - and index_col is not None - ): - columns = meta['columns'] - for idx_col in index_col: - for c in columns: - if c['field_name'] == idx_col: - index_col_names.add(idx_col) - - # remove the index column from the list of column names - # only if index_col_names is not None - if len(index_col_names) >= 0: - col_names = [name for name in col_names if name not in index_col_names] - - return ( - parquet_metadata.num_rows(), - parquet_metadata.num_rowgroups(), - col_names, - len(col_names), - parquet_metadata.rowgroup_metadata() - ) - - -@acquire_spill_lock() -def write_parquet( - table, - object filepaths_or_buffers, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object metadata_file_path=None, - object int96_timestamps=False, - object row_group_size_bytes=None, - object row_group_size_rows=None, - object max_page_size_bytes=None, - object max_page_size_rows=None, - object max_dictionary_size=None, - object partitions_info=None, - object force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, - write_arrow_schema=False, -): - """ - Cython function to call into libcudf API, see `write_parquet`. - - See Also - -------- - cudf.io.parquet.write_parquet - """ - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - if not isinstance(name, str): - if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name)) - else: - raise ValueError( - "Writing a Parquet file requires string column names" - ) - else: - tbl_meta.column_metadata[i].set_name(name) - - _set_col_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - force_nullable_schema, - None, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - if partitions_info is not None: - user_data = [ - {"pandas": generate_pandas_metadata( - table.iloc[start_row:start_row + num_row].copy(deep=False), - index - )} - for start_row, num_row in partitions_info - ] - else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] - - if header_version not in ("1.0", "2.0"): - raise ValueError( - f"Invalid parquet header version: {header_version}. 
" - "Valid values are '1.0' and '2.0'" - ) - - dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - - comp_type = _get_comp_type(compression) - stat_freq = _get_stat_freq(statistics) - options = ( - plc.io.parquet.ParquetWriterOptions.builder( - plc.io.SinkInfo(filepaths_or_buffers), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .int96_timestamps(int96_timestamps) - .write_v2_headers(header_version == "2.0") - .dictionary_policy(dict_policy) - .utc_timestamps(False) - .write_arrow_schema(write_arrow_schema) - .build() - ) - if partitions_info is not None: - options.set_partitions( - [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] - ) - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - options.set_column_chunks_file_paths(metadata_file_path) - else: - options.set_column_chunks_file_paths([metadata_file_path]) - if row_group_size_bytes is not None: - options.set_row_group_size_bytes(row_group_size_bytes) - if row_group_size_rows is not None: - options.set_row_group_size_rows(row_group_size_rows) - if max_page_size_bytes is not None: - options.set_max_page_size_bytes(max_page_size_bytes) - if max_page_size_rows is not None: - options.set_max_page_size_rows(max_page_size_rows) - if max_dictionary_size is not None: - options.set_max_dictionary_size(max_dictionary_size) - blob = plc.io.parquet.write_parquet(options) - if metadata_file_path is not None: - return np.asarray(blob.obj) - else: - return None - - -cdef class ParquetWriter: - """ - ParquetWriter lets you incrementally write out a Parquet file from a series - of cudf tables - - Parameters - ---------- - filepath_or_buffer : str, io.IOBase, os.PathLike, or list - File path or buffer to write to. The argument may also correspond - to a list of file paths or buffers. - index : bool or None, default None - If ``True``, include a dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - row_group_size_bytes: int, default ``uint64 max`` - Maximum size of each stripe of the output. - By default, a virtually infinite size equal to ``uint64 max`` will be used. - row_group_size_rows: int, default 1000000 - Maximum number of rows of each stripe of the output. - By default, 1000000 (10^6 rows) will be used. - max_page_size_bytes: int, default 524288 - Maximum uncompressed size of each page of the output. - By default, 524288 (512KB) will be used. - max_page_size_rows: int, default 20000 - Maximum number of rows of each page of the output. - By default, 20000 will be used. - max_dictionary_size: int, default 1048576 - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - By default, 1048576 (1MB) will be used. - use_dictionary : bool, default True - If ``True``, enable dictionary encoding for Parquet page data - subject to ``max_dictionary_size`` constraints. - If ``False``, disable dictionary encoding for Parquet page data. 
- store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. - See Also - -------- - cudf.io.parquet.write_parquet - """ - cdef bool initialized - cdef ParquetChunkedWriter writer - cdef SinkInfo sink - cdef TableInputMetadata tbl_meta - cdef str statistics - cdef object compression - cdef object index - cdef size_t row_group_size_bytes - cdef size_type row_group_size_rows - cdef size_t max_page_size_bytes - cdef size_type max_page_size_rows - cdef size_t max_dictionary_size - cdef bool use_dictionary - cdef bool write_arrow_schema - - def __cinit__(self, object filepath_or_buffer, object index=None, - object compression="snappy", str statistics="ROWGROUP", - size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - size_type row_group_size_rows=1000000, - size_t max_page_size_bytes=524288, - size_type max_page_size_rows=20000, - size_t max_dictionary_size=1048576, - bool use_dictionary=True, - bool store_schema=False): - filepaths_or_buffers = ( - list(filepath_or_buffer) - if is_list_like(filepath_or_buffer) - else [filepath_or_buffer] - ) - self.sink = plc.io.SinkInfo(filepaths_or_buffers) - self.statistics = statistics - self.compression = compression - self.index = index - self.initialized = False - self.row_group_size_bytes = row_group_size_bytes - self.row_group_size_rows = row_group_size_rows - self.max_page_size_bytes = max_page_size_bytes - self.max_page_size_rows = max_page_size_rows - self.max_dictionary_size = max_dictionary_size - self.use_dictionary = use_dictionary - self.write_arrow_schema = store_schema - - def write_table(self, table, object partitions_info=None): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state( - table, - num_partitions=len(partitions_info) if partitions_info else 1 - ) - if self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex)): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - self.writer.write(plc_table, partitions_info) - - def close(self, object metadata_file_path=None): - if not self.initialized: - return None - column_chunks_file_paths=[] - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - column_chunks_file_paths = list(metadata_file_path) - else: - column_chunks_file_paths = [metadata_file_path] - blob = self.writer.close(column_chunks_file_paths) - if metadata_file_path is not None: - return np.asarray(blob.obj) - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def _initialize_chunked_state(self, table, num_partitions=1): - """ Prepares all the values required to build the - chunked_parquet_writer_options and creates a writer""" - - # Set the table_metadata - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in 
enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name(idx_name) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name(table._index.name) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - ) - - index = ( - False if isinstance(table._index, cudf.RangeIndex) else self.index - ) - user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions - cdef compression_type comp_type = _get_comp_type(self.compression) - cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) - cdef dictionary_policy dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if self.use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - options = ( - plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - options.set_dictionary_policy(dict_policy) - self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) - self.initialized = True - - -cpdef merge_filemetadata(object filemetadata_list): - """ - Cython function to call into libcudf API, see `merge_row_group_metadata`. - - See Also - -------- - cudf.io.parquet.merge_row_group_metadata - """ - return np.asarray( - plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj - ) - - -cdef statistics_freq _get_stat_freq(str statistics): - result = getattr( - plc.io.types.StatisticsFreq, - f"STATISTICS_{statistics.upper()}", - None - ) - if result is None: - raise ValueError("Unsupported `statistics_freq` type") - return result - - -cdef compression_type _get_comp_type(object compression): - if compression is None: - return plc.io.types.CompressionType.NONE - result = getattr( - plc.io.types.CompressionType, - str(compression).upper(), - None - ) - if result is None: - raise ValueError("Unsupported `compression` type") - return result - - -cdef _set_col_metadata( - Column col, - ColumnInMetadata col_meta, - bool force_nullable_schema=False, - str path=None, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, -): - need_path = (skip_compression is not None or column_encoding is not None or - column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name() if need_path else None - full_path = path + "." + name if path is not None else name - - if force_nullable_schema: - # Only set nullability if `force_nullable_schema` - # is true. 
- col_meta.set_nullability(True) - - if skip_compression is not None and full_path in skip_compression: - col_meta.set_skip_compression(True) - - if column_encoding is not None and full_path in column_encoding: - encoding = column_encoding[full_path] - if encoding is None: - c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT - else: - enc = str(encoding).upper() - c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) - if c_encoding is None: - raise ValueError("Unsupported `column_encoding` type") - col_meta.set_encoding(c_encoding) - - if column_type_length is not None and full_path in column_type_length: - col_meta.set_output_as_binary(True) - col_meta.set_type_length(column_type_length[full_path]) - - if output_as_binary is not None and full_path in output_as_binary: - col_meta.set_output_as_binary(True) - - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_metadata( - child_col, - col_meta.child(i), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.ListDtype): - if full_path is not None: - full_path = full_path + ".list" - col_meta.child(1).set_name("element") - _set_col_metadata( - col.children[1], - col_meta.child(1), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): - col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 2382e9f12ed..66095d4a155 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
from __future__ import annotations +import io import itertools import math import operator @@ -10,23 +11,42 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Literal from uuid import uuid4 import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds +import pylibcudf as plc + import cudf -from cudf._lib import parquet as libparquet +from cudf._lib.column import Column +from cudf._lib.utils import ( + _data_from_columns, + _index_level_name, + data_from_pylibcudf_io, + generate_pandas_metadata, +) from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable + + from typing_extensions import Self + + from cudf.core.column import ColumnBase BYTE_SIZES = { @@ -55,31 +75,200 @@ } +@acquire_spill_lock() +def _plc_write_parquet( + table, + filepaths_or_buffers, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, + partitions_info=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = False, +) -> np.ndarray | None: + """ + Cython function to call into libcudf API, see `write_parquet`. 
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + tbl_meta.column_metadata[level].set_name( + _index_level_name(idx_name, level, table._column_names) + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.column_metadata[i].set_name(str(name)) + else: + raise ValueError( + "Writing a Parquet file requires string column names" + ) + else: + tbl_meta.column_metadata[i].set_name(name) + + _set_col_metadata( + table[name]._column, + tbl_meta.column_metadata[i], + force_nullable_schema, + None, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + if partitions_info is not None: + user_data = [ + { + "pandas": generate_pandas_metadata( + table.iloc[start_row : start_row + num_row].copy( + deep=False + ), + index, + ) + } + for start_row, num_row in partitions_info + ] + else: + user_data = [{"pandas": generate_pandas_metadata(table, index)}] + + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .int96_timestamps(int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) + .build() + ) + if partitions_info is not None: + options.set_partitions( + [ + plc.io.types.PartitionInfo(part[0], part[1]) + for part in partitions_info + ] + ) + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + options.set_column_chunks_file_paths(metadata_file_path) + else: + options.set_column_chunks_file_paths([metadata_file_path]) + if row_group_size_bytes is not None: + options.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + options.set_row_group_size_rows(row_group_size_rows) + if max_page_size_bytes is not None: + options.set_max_page_size_bytes(max_page_size_bytes) + if max_page_size_rows is not None: + options.set_max_page_size_rows(max_page_size_rows) + if max_dictionary_size is not None: + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) + if metadata_file_path is not None: + return np.asarray(blob.obj) + else: + return None + + @_performance_tracking def _write_parquet( df, paths, - compression="snappy", - index=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - 
row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, partitions_info=None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - write_arrow_schema=True, -): + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = True, +) -> np.ndarray | None: if is_list_like(paths) and len(paths) > 1: if partitions_info is None: ValueError("partition info is required for multiple paths") @@ -124,11 +313,11 @@ def _write_parquet( file_objs = [ ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs ] - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=file_objs, **common_args ) else: - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=paths_or_bufs, **common_args ) @@ -141,26 +330,38 @@ def _write_parquet( def write_to_dataset( df, root_path, - compression="snappy", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", filename=None, partition_cols=None, fs=None, - preserve_index=False, - return_metadata=False, - statistics="ROWGROUP", - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, + preserve_index: bool = False, + return_metadata: bool = False, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
@@ -330,9 +531,29 @@ def write_to_dataset( return metadata +def _parse_metadata(meta) -> tuple[bool, Any, Any]: + file_is_range_index = False + file_index_cols = None + file_column_dtype = None + + if "index_columns" in meta and len(meta["index_columns"]) > 0: + file_index_cols = meta["index_columns"] + + if ( + isinstance(file_index_cols[0], dict) + and file_index_cols[0]["kind"] == "range" + ): + file_is_range_index = True + if "column_indexes" in meta and len(meta["column_indexes"]) == 1: + file_column_dtype = meta["column_indexes"][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype + + @ioutils.doc_read_parquet_metadata() @_performance_tracking -def read_parquet_metadata(filepath_or_buffer): +def read_parquet_metadata( + filepath_or_buffer, +) -> tuple[int, int, list[Hashable], int, list[dict[str, int]]]: """{docstring}""" # List of filepaths or buffers @@ -341,7 +562,39 @@ def read_parquet_metadata(filepath_or_buffer): bytes_per_thread=None, ) - return libparquet.read_parquet_metadata(filepaths_or_buffers) + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) + + # read all column names including index column, if any + col_names = [ + info.name() for info in parquet_metadata.schema().root().children() + ] + + index_col_names = set() + json_str = parquet_metadata.metadata()["pandas"] + if json_str != "": + meta = json.loads(json_str) + file_is_range_index, index_col, _ = _parse_metadata(meta) + if not file_is_range_index and index_col is not None: + columns = meta["columns"] + for idx_col in index_col: + for c in columns: + if c["field_name"] == idx_col: + index_col_names.add(idx_col) + + # remove the index column from the list of column names + # only if index_col_names is not None + if len(index_col_names) >= 0: + col_names = [name for name in col_names if name not in index_col_names] + + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata(), + ) @_performance_tracking @@ -913,16 +1166,18 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, + nrows: int | None = None, + skip_rows: int | None = None, + allow_mismatched_pq_schemas: bool = False, *args, **kwargs, -): +) -> cudf.DataFrame: # Simple helper function to dispatch between # cudf and pyarrow to read parquet data if engine == "cudf": - if kwargs: + if set(kwargs.keys()).difference( + set(("_chunk_read_limit", "_pass_read_limit")) + ): raise ValueError( "cudf engine doesn't support the " f"following keyword arguments: {list(kwargs.keys())}" @@ -932,30 +1187,123 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 if cudf.get_option("io.parquet.low_memory"): - return libparquet.read_parquet_chunked( + # Note: If this function ever takes accepts filters + # allow_range_index needs to be False when a filter is passed + # (see read_parquet) + allow_range_index = columns is not None and len(columns) != 0 + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + 
options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + + reader = plc.io.parquet.ChunkedParquetReader( + options, + chunk_read_limit=kwargs.get("_chunk_read_limit", 0), + pass_read_limit=kwargs.get("_pass_read_limit", 1024000000), + ) + + tbl_w_meta = reader.read_chunk() + column_names = tbl_w_meta.column_names(include_children=False) + child_names = tbl_w_meta.child_names + per_file_user_data = tbl_w_meta.per_file_user_data + concatenated_columns = tbl_w_meta.tbl.columns() + + # save memory + del tbl_w_meta + + while reader.has_next(): + tbl = reader.read_chunk().tbl + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[ + Column.from_pylibcudf(plc) + for plc in concatenated_columns + ], + column_names=column_names, + index_names=None, + ) + ) + df = _process_metadata( + df, + column_names, + child_names, + per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows if nrows is not None else -1, - skip_rows=skip_rows if skip_rows is not None else 0, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + allow_range_index, + use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) + return df else: - if nrows is None: - nrows = -1 - if skip_rows is None: - skip_rows = 0 - return libparquet.read_parquet( + allow_range_index = True + filters = kwargs.get("filters", None) + if columns is not None and len(columns) == 0 or filters: + allow_range_index = False + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) + + df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + + df = _process_metadata( + df, + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.child_names, + tbl_w_meta.per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, + allow_range_index, + use_pandas_metadata, nrows=nrows, skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) + return df else: if ( isinstance(filepaths_or_buffers, list) @@ -980,28 +1328,40 @@ def to_parquet( df, path, engine="cudf", - compression="snappy", - index=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, partition_cols=None, partition_file_name=None, partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + 
row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, storage_options=None, - return_metadata=False, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + return_metadata: bool = False, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, *args, **kwargs, @@ -1114,10 +1474,11 @@ def to_parquet( @ioutils.doc_merge_parquet_filemetadata() -def merge_parquet_filemetadata(filemetadata_list): +def merge_parquet_filemetadata(filemetadata_list: list) -> np.ndarray: """{docstring}""" - - return libparquet.merge_filemetadata(filemetadata_list) + return np.asarray( + plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj + ) def _generate_filename(): @@ -1205,10 +1566,207 @@ def _get_groups_and_offsets( return part_names, grouped_df, part_offsets -ParquetWriter = libparquet.ParquetWriter +class ParquetWriter: + """ + ParquetWriter lets you incrementally write out a Parquet file from a series + of cudf tables + + Parameters + ---------- + filepath_or_buffer : str, io.IOBase, os.PathLike, or list + File path or buffer to write to. The argument may also correspond + to a list of file paths or buffers. + index : bool or None, default None + If ``True``, include a dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, + index(es) other than RangeIndex will be saved as columns. + compression : {'snappy', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' + Level at which column statistics should be included in file. + row_group_size_bytes: int, default ``uint64 max`` + Maximum size of each stripe of the output. + By default, a virtually infinite size equal to ``uint64 max`` will be used. + row_group_size_rows: int, default 1000000 + Maximum number of rows of each stripe of the output. + By default, 1000000 (10^6 rows) will be used. + max_page_size_bytes: int, default 524288 + Maximum uncompressed size of each page of the output. + By default, 524288 (512KB) will be used. + max_page_size_rows: int, default 20000 + Maximum number of rows of each page of the output. + By default, 20000 will be used. + max_dictionary_size: int, default 1048576 + Maximum size of the dictionary page for each output column chunk. Dictionary + encoding for column chunks that exceeds this limit will be disabled. + By default, 1048576 (1MB) will be used. + use_dictionary : bool, default True + If ``True``, enable dictionary encoding for Parquet page data + subject to ``max_dictionary_size`` constraints. + If ``False``, disable dictionary encoding for Parquet page data. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. 
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + + def __init__( + self, + filepath_or_buffer, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + row_group_size_bytes: int = int(np.iinfo(np.uint64).max), + row_group_size_rows: int = 1000000, + max_page_size_bytes: int = 524288, + max_page_size_rows: int = 20000, + max_dictionary_size: int = 1048576, + use_dictionary: bool = True, + store_schema: bool = False, + ): + filepaths_or_buffers = ( + list(filepath_or_buffer) + if is_list_like(filepath_or_buffer) + else [filepath_or_buffer] + ) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) + self.statistics = statistics + self.compression = compression + self.index = index + self.initialized = False + self.row_group_size_bytes = row_group_size_bytes + self.row_group_size_rows = row_group_size_rows + self.max_page_size_bytes = max_page_size_bytes + self.max_page_size_rows = max_page_size_rows + self.max_dictionary_size = max_dictionary_size + self.use_dictionary = use_dictionary + self.write_arrow_schema = store_schema + + def write_table(self, table, partitions_info=None) -> None: + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1, + ) + if self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) + + def close(self, metadata_file_path=None) -> np.ndarray | None: + if not self.initialized: + return None + column_chunks_file_paths = [] + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + column_chunks_file_paths = list(metadata_file_path) + else: + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) + if metadata_file_path is not None: + return np.asarray(blob.obj) + return None + + def __enter__(self) -> Self: + return self + + def __exit__(self, *args) -> None: + self.close() + + def _initialize_chunked_state( + self, table, num_partitions: int = 1 + ) -> None: + """Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer + """ + # Set the table_metadata + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + 
num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_metadata( + table[name]._column, + self.tbl_meta.column_metadata[i], + ) -def _parse_bytes(s): + index = ( + False if isinstance(table.index, cudf.RangeIndex) else self.index + ) + user_data = [ + {"pandas": generate_pandas_metadata(table, index)} + ] * num_partitions + comp_type = _get_comp_type(self.compression) + stat_freq = _get_stat_freq(self.statistics) + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) + self.initialized = True + + +def _parse_bytes(s: str) -> int: """Parse byte string to numbers Utility function vendored from Dask. @@ -1345,8 +1903,8 @@ def __init__( path, partition_cols, index=None, - compression="snappy", - statistics="ROWGROUP", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", max_file_size=None, file_name_prefix=None, storage_options=None, @@ -1370,9 +1928,7 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: list[ - tuple[libparquet.ParquetWriter, list[str], str] - ] = [] + self._chunked_writers: list[tuple[ParquetWriter, list[str], str]] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup self.path_cw_map: dict[str, int] = {} @@ -1563,3 +2119,257 @@ def _hive_dirname(name, val): if pd.isna(val): val = "__HIVE_DEFAULT_PARTITION__" return f"{name}={val}" + + +def _set_col_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + force_nullable_schema: bool = False, + path: str | None = None, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, +) -> None: + need_path = ( + skip_compression is not None + or column_encoding is not None + or column_type_length is not None + or output_as_binary is not None + ) + name = col_meta.get_name() if need_path else None + full_path = ( + path + "." + name if (path is not None and name is not None) else name + ) + + if force_nullable_schema: + # Only set nullability if `force_nullable_schema` + # is true. 
+ col_meta.set_nullability(True) + + if skip_compression is not None and full_path in skip_compression: + col_meta.set_skip_compression(True) + + if column_encoding is not None and full_path in column_encoding: + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) + + if column_type_length is not None and full_path in column_type_length: + col_meta.set_output_as_binary(True) + col_meta.set_type_length(column_type_length[full_path]) + + if output_as_binary is not None and full_path in output_as_binary: + col_meta.set_output_as_binary(True) + + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_metadata( + child_col, + col_meta.child(i), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.ListDtype): + if full_path is not None: + full_path = full_path + ".list" + col_meta.child(1).set_name("element") + _set_col_metadata( + col.children[1], + col_meta.child(1), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) + + +def _get_comp_type( + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None], +) -> plc.io.types.CompressionType: + if compression is None: + return plc.io.types.CompressionType.NONE + result = getattr(plc.io.types.CompressionType, compression.upper(), None) + if result is None: + raise ValueError("Unsupported `compression` type") + return result + + +def _get_stat_freq( + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"], +) -> plc.io.types.StatisticsFreq: + result = getattr( + plc.io.types.StatisticsFreq, f"STATISTICS_{statistics.upper()}", None + ) + if result is None: + raise ValueError("Unsupported `statistics_freq` type") + return result + + +def _process_metadata( + df: cudf.DataFrame, + names: list[Hashable], + child_names: dict, + per_file_user_data: list, + row_groups, + filepaths_or_buffers, + allow_range_index: bool, + use_pandas_metadata: bool, + nrows: int = -1, + skip_rows: int = 0, +) -> cudf.DataFrame: + ioutils._add_df_col_struct_names(df, child_names) + index_col = None + is_range_index = True + column_index_type = None + index_col_names = None + meta = None + for single_file in per_file_user_data: + if b"pandas" not in single_file: + continue + json_str = single_file[b"pandas"].decode("utf-8") + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata( + meta + ) + is_range_index &= file_is_range_index + + if ( + not file_is_range_index + and index_col is not None + and index_col_names is None + ): + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = c["name"] + + if meta is not None: + # Book keep each column metadata as the order + # of `meta["columns"]` and `column_names` are not + # guaranteed to be deterministic and same always. 
+ meta_data_per_column = { + col_meta["name"]: col_meta for col_meta in meta["columns"] + } + + # update the decimal precision of each column + for col in names: + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): + df._data[col].dtype.precision = meta_data_per_column[col][ + "metadata" + ]["precision"] + + # Set the index column + if index_col is not None and len(index_col) > 0: + if is_range_index: + if not allow_range_index: + return df + + if len(per_file_user_data) > 1: + range_index_meta = { + "kind": "range", + "name": None, + "start": 0, + "stop": len(df), + "step": 1, + } + else: + range_index_meta = index_col[0] + + if row_groups is not None: + per_file_metadata = [ + pa.parquet.read_metadata( + # Pyarrow cannot read directly from bytes + io.BytesIO(s) if isinstance(s, bytes) else s + ) + for s in filepaths_or_buffers + ] + + filtered_idx = [] + for i, file_meta in enumerate(per_file_metadata): + row_groups_i = [] + start = 0 + for row_group in range(file_meta.num_row_groups): + stop = start + file_meta.row_group(row_group).num_rows + row_groups_i.append((start, stop)) + start = stop + + for rg in row_groups[i]: + filtered_idx.append( + cudf.RangeIndex( + start=row_groups_i[rg][0], + stop=row_groups_i[rg][1], + step=range_index_meta["step"], + ) + ) + + if len(filtered_idx) > 0: + idx = cudf.concat(filtered_idx) + else: + idx = cudf.Index._from_column( + cudf.core.column.column_empty(0) + ) + else: + start = range_index_meta["start"] + skip_rows # type: ignore[operator] + stop = range_index_meta["stop"] + if nrows > -1: + stop = start + nrows + idx = cudf.RangeIndex( + start=start, + stop=stop, + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + + df.index = idx + elif set(index_col).issubset(names): + index_data = df[index_col] + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: + idx = cudf.Index._from_column( + index_data._columns[0], name=next(actual_index_names) + ) + else: + idx = cudf.MultiIndex.from_frame( + index_data, names=list(actual_index_names) + ) + df.drop(columns=index_col, inplace=True) + df.index = idx + else: + if use_pandas_metadata: + df.index.names = index_col + + if df._num_columns == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) + + return df diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 13efa71ebae..77d1f77d30b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,6 @@ from pyarrow import parquet as pq import cudf -from cudf._lib.parquet import read_parquet_chunked from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, @@ -3775,13 +3774,14 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer, row_group_size=10000) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) @@ -3825,12 +3825,13 @@ def test_parquet_chunked_reader_structs( # Number of rows to read 
nrows = num_rows if num_rows is not None else len(df) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3877,12 +3878,13 @@ def test_parquet_chunked_reader_string_decoders( nrows = num_rows if num_rows is not None else len(df) # Check with num_rows specified - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3982,13 +3984,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): ).reset_index(drop=True) # Read with chunked reader (filter columns not supported) - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["list", "d_list", "str"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) # Construct the expected table without filter columns expected_chunked = cudf.concat( @@ -4054,13 +4057,14 @@ def test_parquet_reader_with_mismatched_structs(): ) # Read with chunked reader - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d9a3da6666d..a04fcb8df7a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -43,7 +43,6 @@ } _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for From f811c383b46d7a8acc8496593e3d0caff83d6c8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:56:03 -0500 Subject: [PATCH 2/4] Allow large strings in nvbench strings benchmarks (#17571) Removes the 2GB limit check from the strings benchmarks and adjusts the parameters to be consistent across the benchmarks. The default parameters will still not exceed 2GB for automation purposes. 
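For reference, the adjusted benchmarks share roughly the shape sketched below. This is an illustrative skeleton only, not part of the patch: the `bench_example` name, the include lines, and the elided body are placeholders, while the helpers (`data_profile_builder`, `create_random_column`, `row_count`) and the axis values mirror the updated sources in the diffs that follow.

    #include <benchmarks/common/generate_input.hpp>  // assumed location of the benchmark data helpers
    #include <cudf/types.hpp>
    #include <nvbench/nvbench.cuh>

    // Sketch: a fixed min_width axis plus reduced max_width/num_rows axes,
    // with the former size_type-limit skip check removed.
    static void bench_example(nvbench::state& state)
    {
      auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
      auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
      auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

      // Random strings whose lengths are drawn between min_width and max_width
      data_profile const profile = data_profile_builder().distribution(
        cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
      auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);

      // ... per-benchmark measurement body (unchanged by this patch) ...
    }

    NVBENCH_BENCH(bench_example)
      .set_name("example")
      .add_int64_axis("min_width", {0})
      .add_int64_axis("max_width", {32, 64, 128, 256})
      .add_int64_axis("num_rows", {32768, 262144, 2097152});

The default axis values above stay well under the 2GB size_type limit, which is why the explicit skip check can be dropped; larger widths or row counts can now be passed on the command line when large-string runs are wanted.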
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/17571 --- cpp/benchmarks/string/case.cpp | 19 +++---- cpp/benchmarks/string/char_types.cpp | 15 +++--- cpp/benchmarks/string/contains.cpp | 13 ++--- cpp/benchmarks/string/copy_if_else.cpp | 15 +++--- cpp/benchmarks/string/copy_range.cpp | 15 +++--- cpp/benchmarks/string/count.cpp | 15 +++--- cpp/benchmarks/string/extract.cpp | 9 +--- cpp/benchmarks/string/join_strings.cpp | 15 +++--- cpp/benchmarks/string/lengths.cpp | 15 +++--- cpp/benchmarks/string/like.cpp | 9 +--- cpp/benchmarks/string/replace_re.cpp | 19 +++---- cpp/benchmarks/string/reverse.cpp | 15 +++--- cpp/benchmarks/string/slice.cpp | 9 +--- cpp/benchmarks/string/split.cpp | 15 +++--- cpp/benchmarks/string/split_re.cpp | 15 +++--- cpp/benchmarks/string/string_bench_args.hpp | 56 --------------------- 16 files changed, 80 insertions(+), 189 deletions(-) delete mode 100644 cpp/benchmarks/string/string_bench_args.hpp diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const 
table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 
256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ 
b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) 
NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = 
static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 
1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. 
- */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} From 8a3e5f1a7af6c638397fcabf17bea9192bd799d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:40:20 -0800 Subject: [PATCH 3/4] Remove cudf._lib.nvtext in favor of inlining pylibcudf (#17535) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17535 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 24 -- python/cudf/cudf/_lib/nvtext/__init__.pxd | 0 python/cudf/cudf/_lib/nvtext/__init__.py | 0 .../cudf/_lib/nvtext/byte_pair_encode.pyx | 24 -- .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 24 -- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 35 -- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 17 - python/cudf/cudf/_lib/nvtext/minhash.pyx | 35 -- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 24 -- python/cudf/cudf/_lib/nvtext/normalize.pyx | 28 -- python/cudf/cudf/_lib/nvtext/replace.pyx | 52 --- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 55 --- .../cudf/_lib/nvtext/subword_tokenize.pyx | 38 -- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 86 ---- python/cudf/cudf/_lib/strings/__init__.pxd | 0 python/cudf/cudf/_lib/strings/__init__.py | 30 -- python/cudf/cudf/core/byte_pair_encoding.py | 13 +- python/cudf/cudf/core/column/string.py | 388 ++++++++++++++---- python/cudf/cudf/core/subword_tokenizer.py | 7 +- python/cudf/cudf/core/tokenize_vocabulary.py | 9 +- 22 files changed, 328 insertions(+), 564 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f422635d22a..c2677c6d88d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,5 +30,3 @@ target_include_directories(interop PUBLIC "$ letter_type.CONSONANT - VOWEL = 
letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06196717ce3..c021554f3bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,7 +20,7 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings +from cudf._lib import string_casting as str_cast from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype @@ -45,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -624,7 +625,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - 
libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4829,9 +4828,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5015,7 +5012,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + 
self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5060,7 +5057,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5098,10 +5095,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5180,10 +5177,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5251,8 +5247,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int @@ -5508,7 +5485,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seed, a_column, b_column, width) + self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5559,7 +5536,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seed, a_column, b_column, width) + self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -6148,6 +6126,278 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def minhash64( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + 
result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the 
pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ 
b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.tokenize import ( - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) class TokenizeVocabulary: @@ -20,7 +17,7 @@ class TokenizeVocabulary: Strings column of vocabulary terms """ - def __init__(self, vocabulary: "cudf.Series"): + def __init__(self, vocabulary: cudf.Series) -> None: self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( vocabulary._column.to_pylibcudf(mode="read") ) @@ -46,8 +43,8 @@ def tokenize( if delimiter is None: delimiter = "" delim = cudf.Scalar(delimiter, dtype="str") - result = cpp_tokenize_with_vocabulary( - text._column, self.vocabulary, delim, default_id + result = text._column.tokenize_with_vocabulary( + self.vocabulary, delim, default_id ) return cudf.Series._from_column(result) From 774970283bfa6ca5ac4bc0619fc8595f01b7362b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:06:44 -0800 Subject: [PATCH 4/4] Remove cudf._lib.csv in favor in inlining pylibcudf (#17485) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17485 --- python/cudf/cudf/_lib/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/csv.pyx | 414 ------------------------ python/cudf/cudf/io/csv.py | 466 ++++++++++++++++++++++----- 4 files changed, 385 insertions(+), 501 deletions(-) delete mode 100644 python/cudf/cudf/_lib/csv.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index c2677c6d88d..b402db0443d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,9 +12,8 @@ # the License. # ============================================================================= -set(cython_sources - column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx - stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx + stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index f86a15b932b..0299b264189 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -3,7 +3,6 @@ from . import ( copying, - csv, groupby, interop, reduce, diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx deleted file mode 100644 index 641fc18c203..00000000000 --- a/python/cudf/cudf/_lib/csv.pyx +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libcpp cimport bool - -cimport pylibcudf.libcudf.types as libcudf_types - -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import errno -import os -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -from cudf.api.types import is_hashable - -from pylibcudf.types cimport DataType - -CSV_HEX_TYPE_MAP = { - "hex": np.dtype("int64"), - "hex64": np.dtype("int64"), - "hex32": np.dtype("int32") -} - - -def validate_args( - object delimiter, - object sep, - bool delim_whitespace, - object decimal, - object thousands, - object nrows, - int skipfooter, - object byte_range, - int skiprows -): - if delim_whitespace: - if delimiter is not None: - raise ValueError("cannot set both delimiter and delim_whitespace") - if sep != ',': - raise ValueError("cannot set both sep and delim_whitespace") - - # Alias sep -> delimiter. - actual_delimiter = delimiter if delimiter else sep - - if decimal == actual_delimiter: - raise ValueError("decimal cannot be the same as delimiter") - - if thousands == actual_delimiter: - raise ValueError("thousands cannot be the same as delimiter") - - if nrows is not None and skipfooter != 0: - raise ValueError("cannot use both nrows and skipfooter parameters") - - if byte_range is not None: - if skipfooter != 0 or skiprows != 0 or nrows is not None: - raise ValueError("""cannot manually limit rows to be read when - using the byte range parameter""") - - -def read_csv( - object datasource, - object lineterminator="\n", - object quotechar='"', - int quoting=0, - bool doublequote=True, - object header="infer", - bool mangle_dupe_cols=True, - object usecols=None, - object sep=",", - object delimiter=None, - bool delim_whitespace=False, - bool skipinitialspace=False, - object names=None, - object dtype=None, - int skipfooter=0, - int skiprows=0, - bool dayfirst=False, - object compression="infer", - object thousands=None, - object decimal=".", - object true_values=None, - object false_values=None, - object nrows=None, - object byte_range=None, - bool skip_blank_lines=True, - object parse_dates=None, - object comment=None, - object na_values=None, - bool keep_default_na=True, - bool na_filter=True, - object prefix=None, - object index_col=None, -): - """ - Cython function to call into libcudf API, see `read_csv`. - - See Also - -------- - cudf.read_csv - """ - - if not isinstance(datasource, (BytesIO, StringIO, bytes)): - if not os.path.isfile(datasource): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), datasource - ) - - if isinstance(datasource, StringIO): - datasource = datasource.read().encode() - elif isinstance(datasource, str) and not os.path.isfile(datasource): - datasource = datasource.encode() - - validate_args(delimiter, sep, delim_whitespace, decimal, thousands, - nrows, skipfooter, byte_range, skiprows) - - # Alias sep -> delimiter. 
- if delimiter is None: - delimiter = sep - - delimiter = str(delimiter) - - if byte_range is None: - byte_range = (0, 0) - - if compression is None: - c_compression = plc.io.types.CompressionType.NONE - else: - compression_map = { - "infer": plc.io.types.CompressionType.AUTO, - "gzip": plc.io.types.CompressionType.GZIP, - "bz2": plc.io.types.CompressionType.BZIP2, - "zip": plc.io.types.CompressionType.ZIP, - } - c_compression = compression_map[compression] - - # We need this later when setting index cols - orig_header = header - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - header = -1 - else: - header = header - names = list(names) - else: - if header is None: - header = -1 - elif header == 'infer': - header = 0 - - hex_cols = [] - - new_dtypes = [] - if dtype is not None: - if isinstance(dtype, abc.Mapping): - new_dtypes = dict() - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - hex_cols.append(str(k)) - - new_dtypes[k] = _get_plc_data_type_from_dtype( - cudf.dtype(col_type) - ) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - hex_cols.append(0) - - new_dtypes.append( - _get_plc_data_type_from_dtype(dtype) - ) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - hex_cols.append(index) - - new_dtypes.append( - _get_plc_data_type_from_dtype(col_dtype) - ) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range[0]) - .byte_range_size(byte_range[1]) - .nrows(nrows if nrows is not None else -1) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(str(lineterminator)) - .quotechar(quotechar) - .decimal(decimal) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if names is not None: - options.set_names([str(name) for name in names]) - - if prefix is not None: - options.set_prefix(prefix) - - if usecols is not None: - if all(isinstance(col, int) for col in usecols): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name) for name in usecols]) - - if delimiter is not None: - options.set_delimiter(delimiter) - - if thousands is not None: - options.set_thousands(thousands) - - if comment is not None: - options.set_comment(comment) - - if parse_dates is not None: - options.set_parse_dates(list(parse_dates)) - - if hex_cols is not None: - options.set_parse_hex(list(hex_cols)) - - options.set_dtypes(new_dtypes) - - if true_values is not None: - options.set_true_values([str(val) for val in true_values]) - - if false_values is not None: - options.set_false_values([str(val) for val in false_values]) - - if na_values is not None: - options.set_na_values([str(val) for val in na_values]) - - df = cudf.DataFrame._from_data( - 
*data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - if isinstance(cudf.dtype(v), cudf.CategoricalDtype): - df._data[str(k)] = df._data[str(k)].astype(v) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): - df = df.astype(dtype) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._column_names[index] - df._data[col_name] = df._data[col_name].astype(col_dtype) - - if names is not None and len(names) and isinstance(names[0], int): - df.columns = [int(x) for x in df._data] - elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"): - df.columns = [int(x) for x in df._column_names] - - # Set index if the index_col parameter is passed - if index_col is not None and index_col is not False: - if isinstance(index_col, int): - index_col_name = df._data.get_labels_by_index(index_col)[0] - df = df.set_index(index_col_name) - if isinstance(index_col_name, str) and \ - names is None and orig_header == "infer": - if index_col_name.startswith("Unnamed:"): - # TODO: Try to upstream it to libcudf - # csv reader in future - df._index.name = None - elif names is None: - df._index.name = index_col - else: - df = df.set_index(index_col) - - return df - - -@acquire_spill_lock() -def write_csv( - table, - object path_or_buf=None, - object sep=",", - object na_rep="", - bool header=True, - object lineterminator="\n", - int rows_per_chunk=8, - bool index=True, -): - """ - Cython function to call into libcudf API, see `write_csv`. - - See Also - -------- - cudf.to_csv - """ - index_and_not_empty = index is True and table.index is not None - columns = [ - col.to_pylibcudf(mode="read") for col in table.index._columns - ] if index_and_not_empty else [] - columns.extend(col.to_pylibcudf(mode="read") for col in table._columns) - col_names = [] - if header: - all_names = list(table.index.names) if index_and_not_empty else [] - all_names.extend( - na_rep if name is None or pd.isnull(name) - else name for name in table._column_names - ) - col_names = [ - '""' if (name in (None, '') and len(all_names) == 1) - else (str(name) if name not in (None, '') else '') - for name in all_names - ] - try: - plc.io.csv.write_csv( - ( - plc.io.csv.CsvWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc.Table(columns) - ) - .names(col_names) - .na_rep(na_rep) - .include_header(header) - .rows_per_chunk(rows_per_chunk) - .line_terminator(str(lineterminator)) - .inter_column_delimiter(str(sep)) - .true_value("True") - .false_value("False") - .build() - ) - ) - except OverflowError: - raise OverflowError( - f"Writing CSV file with chunksize={rows_per_chunk} failed. " - "Consider providing a smaller chunksize argument." 
- ) - - -cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: - # TODO: Remove this work-around Dictionary types - # in libcudf are fully mapped to categorical columns: - # https://github.com/rapidsai/cudf/issues/3960 - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype - elif dtype == "category": - dtype = "str" - - if isinstance(dtype, str): - if str(dtype) == "date32": - return DataType( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - elif str(dtype) in ("date", "date64"): - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[us]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - elif str(dtype) == "timestamp[s]": - return DataType( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - elif str(dtype) == "timestamp[ms]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[ns]": - return DataType( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - - dtype = cudf.dtype(dtype) - return dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3dc8915bfd1..da9a66f3874 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,57 +1,73 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations +import errno +import itertools +import os import warnings from collections import abc from io import BytesIO, StringIO +from typing import cast import numpy as np +import pandas as pd + +import pylibcudf as plc import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io +from cudf.api.types import is_hashable, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type from cudf.utils.performance_tracking import _performance_tracking +_CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32"), +} + @_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, - sep=",", - delimiter=None, + sep: str = ",", + delimiter: str | None = None, header="infer", names=None, index_col=None, usecols=None, prefix=None, - mangle_dupe_cols=True, + mangle_dupe_cols: bool = True, dtype=None, true_values=None, false_values=None, - skipinitialspace=False, - skiprows=0, - skipfooter=0, - nrows=None, + skipinitialspace: bool = False, + skiprows: int = 0, + skipfooter: int = 0, + nrows: int | None = None, na_values=None, - keep_default_na=True, - na_filter=True, - skip_blank_lines=True, + keep_default_na: bool = True, + na_filter: bool = True, + skip_blank_lines: bool = True, parse_dates=None, - dayfirst=False, + dayfirst: bool = False, compression="infer", - thousands=None, - decimal=".", - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - comment=None, - delim_whitespace=False, - byte_range=None, + thousands: str | None = None, + decimal: str = ".", + lineterminator: str = "\n", + quotechar: str = '"', + quoting: int = 0, + doublequote: bool = True, + comment: str | None = None, + delim_whitespace: bool = False, + byte_range: list[int] | tuple[int, int] | None = None, storage_options=None, - bytes_per_thread=None, -): + bytes_per_thread: int | None = None, +) -> 
cudf.DataFrame: """{docstring}""" if delim_whitespace is not False: @@ -77,60 +93,225 @@ def read_csv( if na_values is not None and is_scalar(na_values): na_values = [na_values] - df = libcudf.csv.read_csv( - filepath_or_buffer, - lineterminator=lineterminator, - quotechar=quotechar, - quoting=quoting, - doublequote=doublequote, - header=header, - mangle_dupe_cols=mangle_dupe_cols, - usecols=usecols, - sep=sep, - delimiter=delimiter, - delim_whitespace=delim_whitespace, - skipinitialspace=skipinitialspace, - names=names, - dtype=dtype, - skipfooter=skipfooter, - skiprows=skiprows, - dayfirst=dayfirst, - compression=compression, - thousands=thousands, - decimal=decimal, - true_values=true_values, - false_values=false_values, - nrows=nrows, - byte_range=byte_range, - skip_blank_lines=skip_blank_lines, - parse_dates=parse_dates, - comment=comment, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - prefix=prefix, - index_col=index_col, + if not isinstance(filepath_or_buffer, (BytesIO, StringIO, bytes)): + if not os.path.isfile(filepath_or_buffer): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), filepath_or_buffer + ) + + if isinstance(filepath_or_buffer, StringIO): + filepath_or_buffer = filepath_or_buffer.read().encode() + elif isinstance(filepath_or_buffer, str) and not os.path.isfile( + filepath_or_buffer + ): + filepath_or_buffer = filepath_or_buffer.encode() + + _validate_args( + delimiter, + sep, + delim_whitespace, + decimal, + thousands, + nrows, + skipfooter, + byte_range, + skiprows, + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = plc.io.types.CompressionType.NONE + else: + compression_map = { + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, + } + c_compression = compression_map[compression] + + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == "infer": + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == "infer": + header = 0 + + hex_cols: list[abc.Hashable] = [] + new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = {} + for k, col_type in dtype.items(): + if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP: + col_type = _CSV_HEX_TYPE_MAP[col_type] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif cudf.api.types.is_scalar(dtype) or isinstance( + dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type) + ): + if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP: + dtype = _CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype)) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP: + col_dtype = _CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype)) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + options = ( + 
plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([filepath_or_buffer]) + ) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) + + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) + if isinstance(dtype, abc.Mapping): + for k, v in dtype.items(): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): + df._data[str(k)] = df._data[str(k)].astype(v) + elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype): + df = df.astype(dtype) + elif isinstance(dtype, abc.Collection) and not is_scalar(dtype): + for index, col_dtype in enumerate(dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): + col_name = df._column_names[index] + df._data[col_name] = df._data[col_name].astype(col_dtype) + + if names is not None and len(names) and isinstance(names[0], int): + df.columns = [int(x) for x in df._data] + elif ( + names is None + and header == -1 + and cudf.get_option("mode.pandas_compatible") + ): + df.columns = [int(x) for x in df._column_names] + + # Set index if the index_col parameter is passed + if index_col is not None and index_col is not False: + if isinstance(index_col, int): + index_col_name = df._data.get_labels_by_index(index_col)[0] + df = df.set_index(index_col_name) + if ( + isinstance(index_col_name, str) + and names is None + and orig_header == "infer" + ): + if index_col_name.startswith("Unnamed:"): + # TODO: Try to upstream it to libcudf + # csv reader in future + df.index.name = None + elif names is None: + df.index.name = index_col + else: + df = df.set_index(index_col) + if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
specified_dtypes = {} if dtype is None else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): + for name, dt in df._dtypes: + if name in specified_dtypes: + continue + elif dt == np.dtype("i1"): # csv reader reads all null column as int8. # The dtype should remain int8. default_dtypes[name] = dt else: default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) + + if default_dtypes: + df = df.astype(default_dtypes) return df @@ -138,17 +319,17 @@ def read_csv( @_performance_tracking @ioutils.doc_to_csv() def to_csv( - df, + df: cudf.DataFrame, path_or_buf=None, - sep=",", - na_rep="", + sep: str = ",", + na_rep: str = "", columns=None, - header=True, - index=True, + header: bool = True, + index: bool = True, encoding=None, compression=None, - lineterminator="\n", - chunksize=None, + lineterminator: str = "\n", + chunksize: int | None = None, storage_options=None, ): """{docstring}""" @@ -187,15 +368,10 @@ def to_csv( ) for _, dtype in df._dtypes: - if isinstance(dtype, cudf.ListDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." - ) - elif isinstance(dtype, cudf.StructDtype): + if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise NotImplementedError( "Writing to csv format is not yet supported with " - "Struct columns." + f"{dtype} columns." ) # TODO: Need to typecast categorical columns to the underlying @@ -208,7 +384,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._column_labels_and_values: if isinstance(col.dtype, cudf.CategoricalDtype): - df._data[col_name] = col.astype(col.categories.dtype) + df._data[col_name] = col.astype(col.dtype.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) @@ -218,7 +394,7 @@ def to_csv( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=file_obj, sep=sep, @@ -229,7 +405,7 @@ def to_csv( index=index, ) else: - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=path_or_buf, sep=sep, @@ -243,3 +419,127 @@ def to_csv( if return_as_string: path_or_buf.seek(0) return path_or_buf.read() + + +@acquire_spill_lock() +def _plc_write_csv( + table: cudf.DataFrame, + path_or_buf=None, + sep: str = ",", + na_rep: str = "", + header: bool = True, + lineterminator: str = "\n", + rows_per_chunk: int = 8, + index: bool = True, +) -> None: + iter_columns = ( + itertools.chain(table.index._columns, table._columns) + if index + else table._columns + ) + columns = [col.to_pylibcudf(mode="read") for col in iter_columns] + col_names = [] + if header: + table_names = ( + na_rep if name is None or pd.isnull(name) else name + for name in table._column_names + ) + iter_names = ( + itertools.chain(table.index.names, table_names) + if index + else table_names + ) + all_names = list(iter_names) + col_names = [ + '""' + if (name in (None, "") and len(all_names) == 1) + else (str(name) if name not in (None, "") else "") + for name in all_names + ] + try: + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc.Table(columns) + ) + .names(col_names) + .na_rep(na_rep) + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(str(lineterminator)) + 
.inter_column_delimiter(str(sep)) + .true_value("True") + .false_value("False") + .build() + ) + ) + except OverflowError as err: + raise OverflowError( + f"Writing CSV file with chunksize={rows_per_chunk} failed. " + "Consider providing a smaller chunksize argument." + ) from err + + +def _validate_args( + delimiter: str | None, + sep: str, + delim_whitespace: bool, + decimal: str, + thousands: str | None, + nrows: int | None, + skipfooter: int, + byte_range: list[int] | tuple[int, int] | None, + skiprows: int, +) -> None: + if delim_whitespace: + if delimiter is not None: + raise ValueError("cannot set both delimiter and delim_whitespace") + if sep != ",": + raise ValueError("cannot set both sep and delim_whitespace") + + # Alias sep -> delimiter. + actual_delimiter = delimiter if delimiter else sep + + if decimal == actual_delimiter: + raise ValueError("decimal cannot be the same as delimiter") + + if thousands == actual_delimiter: + raise ValueError("thousands cannot be the same as delimiter") + + if nrows is not None and skipfooter != 0: + raise ValueError("cannot use both nrows and skipfooter parameters") + + if byte_range is not None: + if skipfooter != 0 or skiprows != 0 or nrows is not None: + raise ValueError( + "cannot manually limit rows to be read when using the byte range parameter" + ) + + +def _get_plc_data_type_from_dtype(dtype) -> plc.DataType: + # TODO: Remove this work-around Dictionary types + # in libcudf are fully mapped to categorical columns: + # https://github.com/rapidsai/cudf/issues/3960 + if isinstance(dtype, cudf.CategoricalDtype): + dtype = dtype.categories.dtype + elif dtype == "category": + dtype = "str" + + if isinstance(dtype, str): + if dtype == "date32": + return plc.DataType(plc.types.TypeId.TIMESTAMP_DAYS) + elif dtype in ("date", "date64"): + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp[us]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MICROSECONDS) + elif dtype == "timestamp[s]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_SECONDS) + elif dtype == "timestamp[ms]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS) + elif dtype == "timestamp[ns]": + return plc.DataType(plc.types.TypeId.TIMESTAMP_NANOSECONDS) + + dtype = cudf.dtype(dtype) + return dtype_to_pylibcudf_type(dtype)
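Editor's note: for readers skimming the new python/cudf/cudf/io/csv.py above, the short standalone sketch below shows the pylibcudf builder pattern that this patch inlines into cudf. It is illustrative only, not part of the patch: it uses only calls that appear in the diff (CsvReaderOptions.builder, SourceInfo, set_header, read_csv, data_from_pylibcudf_io), while the in-memory buffer, its column names, and the assumption that the builder's defaults are acceptable for options left unset are all hypothetical.

    # Minimal sketch: read CSV bytes through pylibcudf, mirroring the
    # inlined path in cudf/io/csv.py (illustrative, not from the patch).
    from io import BytesIO

    import pylibcudf as plc

    import cudf
    from cudf._lib.utils import data_from_pylibcudf_io

    source = BytesIO(b"a,b\n1,x\n2,y\n")  # hypothetical in-memory CSV

    options = (
        plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source]))
        .compression(plc.io.types.CompressionType.NONE)
        .nrows(-1)      # -1 means "read all rows", as in the patch
        .skiprows(0)
        .build()
    )
    options.set_header(0)  # row 0 supplies the column names

    # plc.io.csv.read_csv returns a table with metadata; cudf converts it
    # into a DataFrame the same way the new read_csv in the diff does.
    df = cudf.DataFrame._from_data(
        *data_from_pylibcudf_io(plc.io.csv.read_csv(options))
    )
    print(df)

The public cudf.read_csv signature is unchanged by this patch; the sketch only makes explicit the lower-level options-building step that previously lived in the deleted cudf/_lib/csv.pyx.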