diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 91770a52199ca..b738dc04b0c81 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -21,6 +21,9 @@ import os
 import warnings
 from cython import sizeof
 
+cdef extern from "<variant>" namespace "std":
+    c_bool holds_alternative[T](...)
+    T get[T](...)
 
 cdef _sequence_to_array(object sequence, object mask, object size,
                         DataType type, CMemoryPool* pool, c_bool from_pandas):
@@ -704,6 +707,101 @@ def _restore_array(data):
     return pyarrow_wrap_array(MakeArray(ad))
 
 
+cdef class ArrayStatistics(_Weakrefable):
+    """
+    The class for statistics of an array.
+    """
+
+    def __init__(self):
+        raise TypeError(f"Do not call {self.__class__.__name__}'s constructor "
+                        "directly")
+
+    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics):
+        self.sp_statistics = sp_statistics
+
+    def __repr__(self):
+        return (f"arrow.ArrayStatistics<"
+                f"null_count={self.null_count}, distinct_count={self.distinct_count}, "
+                f"min={self.min}, is_min_exact={self.is_min_exact}, "
+                f"max={self.max}, is_max_exact={self.is_max_exact}>")
+
+    @property
+    def null_count(self):
+        """
+        The number of nulls.
+        """
+        null_count = self.sp_statistics.get().null_count
+        # We'll be able to simplify this after
+        # https://github.com/cython/cython/issues/6692 is solved.
+        if null_count.has_value():
+            return null_count.value()
+        else:
+            return None
+
+    @property
+    def distinct_count(self):
+        """
+        The number of distinct values.
+        """
+        distinct_count = self.sp_statistics.get().distinct_count
+        # We'll be able to simplify this after
+        # https://github.com/cython/cython/issues/6692 is solved.
+        if distinct_count.has_value():
+            return distinct_count.value()
+        else:
+            return None
+
+    @property
+    def min(self):
+        """
+        The minimum value.
+        """
+        return self._get_value(self.sp_statistics.get().min)
+
+    @property
+    def is_min_exact(self):
+        """
+        Whether the minimum value is an exact value or not.
+        """
+        return self.sp_statistics.get().is_min_exact
+
+    @property
+    def max(self):
+        """
+        The maximum value.
+        """
+        return self._get_value(self.sp_statistics.get().max)
+
+    @property
+    def is_max_exact(self):
+        """
+        Whether the maximum value is an exact value or not.
+        """
+        return self.sp_statistics.get().is_max_exact
+
+    cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value):
+        """
+        Get a raw value from
+        std::optional<arrow::ArrayStatistics::ValueType> data.
+
+        arrow::ArrayStatistics::ValueType is
+        std::variant<bool, int64_t, uint64_t, double, std::string>.
+        """
+        if not optional_value.has_value():
+            return None
+        value = optional_value.value()
+        if holds_alternative[c_bool](value):
+            return get[c_bool](value)
+        elif holds_alternative[int64_t](value):
+            return get[int64_t](value)
+        elif holds_alternative[uint64_t](value):
+            return get[uint64_t](value)
+        elif holds_alternative[double](value):
+            return get[double](value)
+        else:
+            return get[c_string](value)
+
+
 cdef class _PandasConvertible(_Weakrefable):
 
     def to_pandas(
@@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible):
         if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU:
             raise NotImplementedError("Implemented only for data on CPU device")
 
+    @property
+    def statistics(self):
+        """
+        Statistics of the array.
+        """
+        cdef ArrayStatistics stat
+        sp_stat = self.sp_array.get().statistics()
+        if sp_stat.get() == nullptr:
+            return None
+        else:
+            stat = ArrayStatistics.__new__(ArrayStatistics)
+            stat.init(sp_stat)
+            return stat
+
 
 cdef _array_like_to_pandas(obj, options, types_mapper):
     cdef:
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d4e34e0a84909..556696e3442e9 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -101,6 +101,16 @@ cdef extern from "arrow/util/future.h" namespace "arrow" nogil:
         CStatus status()
 
 
+cdef extern from "<variant>" namespace "std" nogil:
+    cdef cppclass CArrayStatisticsValueType" std::variant<bool, int64_t, uint64_t, double, std::string>":
+        CArrayStatisticsValueType()
+        CArrayStatisticsValueType(c_bool)
+        CArrayStatisticsValueType(int64_t)
+        CArrayStatisticsValueType(uint64_t)
+        CArrayStatisticsValueType(double)
+        CArrayStatisticsValueType(c_string)
+
+
 cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef enum Type" arrow::Type::type":
         _Type_NA" arrow::Type::NA"
@@ -188,6 +198,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     c_bool is_primitive(Type type)
     c_bool is_numeric(Type type)
 
+    cdef cppclass CArrayStatistics" arrow::ArrayStatistics":
+        optional[int64_t] null_count
+        optional[int64_t] distinct_count
+        optional[CArrayStatisticsValueType] min
+        c_bool is_min_exact
+        optional[CArrayStatisticsValueType] max
+        c_bool is_max_exact
+
+        c_bool Equals(const CArrayStatistics& statistics) const
+
     cdef cppclass CArrayData" arrow::ArrayData":
         shared_ptr[CDataType] type
         int64_t length
@@ -251,6 +271,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CDeviceAllocationType device_type()
         CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& to) const
 
+        const shared_ptr[CArrayStatistics]& statistics() const
+
     shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
     CResult[shared_ptr[CArray]] MakeArrayOfNull(
         const shared_ptr[CDataType]& type, int64_t length, CMemoryPool* pool)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 892c974ab12a4..0b2dedad50929 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -261,6 +261,14 @@ cdef class Scalar(_Weakrefable):
     cdef inline shared_ptr[CScalar] unwrap(self) nogil
 
 
+cdef class ArrayStatistics(_Weakrefable):
+    cdef:
+        shared_ptr[CArrayStatistics] sp_statistics
+
+    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics) except *
+    cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value)
+
+
 cdef class _PandasConvertible(_Weakrefable):
     pass
 
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py
index 93097a1afaac9..ae8a16e874acc 100644
--- a/python/pyarrow/tests/parquet/test_parquet_file.py
+++ b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -334,3 +334,22 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
         assert f.read() == table
         assert not f.closed
     assert f.closed
+
+
+def test_read_statistics():
+    table = pa.table({"value": pa.array([-1, None, 3])})
+    buf = io.BytesIO()
+    _write_table(table, buf)
+    buf.seek(0)
+
+    statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
+    assert statistics.null_count == 1
+    assert statistics.distinct_count is None
+    assert statistics.min == -1
+    assert statistics.is_min_exact
+    assert statistics.max == 3
+    assert statistics.is_max_exact
+    assert repr(statistics) == ("arrow.ArrayStatistics<"
+                                "null_count=1, distinct_count=None, "
+                                "min=-1, is_min_exact=True, "
+                                "max=3, is_max_exact=True>")