Skip to content

Commit

Permalink
apacheGH-45457: [Python] Add pyarrow.ArrayStatistics (apache#45550)
Browse files Browse the repository at this point in the history
### Rationale for this change

Apache Arrow C++ can attach statistics read from Apache Parquet data to `arrow::Array`. Exposing this feature through Python bindings lets Python users access the attached statistics as well.

### What changes are included in this PR?

* Add `pyarrow.ArrayStatistics`.
* Add `pyarrow.Array.statistics()`.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* GitHub Issue: apache#45457

Lead-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
3 people authored and arashandishgar committed Feb 25, 2025
1 parent d0a0001 commit 0936f1e
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 0 deletions.
112 changes: 112 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import os
import warnings
from cython import sizeof

# Declarations of the std::holds_alternative and std::get function templates
# from the C++ <variant> header. Both are declared with a variadic (...)
# parameter list, so the argument type is not checked at the Cython level;
# T is the variant alternative being tested for or extracted.
cdef extern from "<variant>" namespace "std":
c_bool holds_alternative[T](...)
T get[T](...)

cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
Expand Down Expand Up @@ -704,6 +707,101 @@ def _restore_array(data):
return pyarrow_wrap_array(MakeArray(ad))


cdef class ArrayStatistics(_Weakrefable):
    """
    Read-only view of the statistics attached to an array.

    Instances wrap a shared pointer to a C++ ``arrow::ArrayStatistics``
    object. They cannot be created by calling the constructor; the
    wrapped pointer is installed through the C-level ``init`` method.
    """

    def __init__(self):
        # Direct construction from Python is always rejected.
        message = (f"Do not call {self.__class__.__name__}'s constructor "
                   "directly")
        raise TypeError(message)

    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics):
        # Store the shared pointer that every property below reads from.
        self.sp_statistics = sp_statistics

    def __repr__(self):
        text = (f"arrow.ArrayStatistics<null_count={self.null_count}, "
                f"distinct_count={self.distinct_count}, min={self.min}, "
                f"is_min_exact={self.is_min_exact}, max={self.max}, "
                f"is_max_exact={self.is_max_exact}>")
        return text

    @property
    def null_count(self):
        """
        The number of nulls, or None when the statistics hold no value
        for it.
        """
        count = self.sp_statistics.get().null_count
        # The explicit has_value()/value() unwrapping can be simplified
        # once https://github.com/cython/cython/issues/6692 is solved.
        if not count.has_value():
            return None
        return count.value()

    @property
    def distinct_count(self):
        """
        The number of distinct values, or None when the statistics hold
        no value for it.
        """
        count = self.sp_statistics.get().distinct_count
        # The explicit has_value()/value() unwrapping can be simplified
        # once https://github.com/cython/cython/issues/6692 is solved.
        if not count.has_value():
            return None
        return count.value()

    @property
    def min(self):
        """
        The minimum value, or None when the statistics hold no minimum.
        """
        return self._get_value(self.sp_statistics.get().min)

    @property
    def is_min_exact(self):
        """
        Whether the minimum value is an exact value or not.
        """
        return self.sp_statistics.get().is_min_exact

    @property
    def max(self):
        """
        The maximum value, or None when the statistics hold no maximum.
        """
        return self._get_value(self.sp_statistics.get().max)

    @property
    def is_max_exact(self):
        """
        Whether the maximum value is an exact value or not.
        """
        return self.sp_statistics.get().is_max_exact

    cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value):
        """
        Get a raw value from
        std::optional<arrow::ArrayStatistics::ValueType> data.

        arrow::ArrayStatistics::ValueType is
        std::variant<bool, int64_t, uint64_t, double, std::string>.

        Returns None when the optional is empty; otherwise returns the
        alternative currently held by the variant.
        """
        if not optional_value.has_value():
            return None
        held = optional_value.value()
        # Probe the alternatives in declaration order and return the
        # first one the variant actually holds.
        if holds_alternative[c_bool](held):
            return get[c_bool](held)
        if holds_alternative[int64_t](held):
            return get[int64_t](held)
        if holds_alternative[uint64_t](held):
            return get[uint64_t](held)
        if holds_alternative[double](held):
            return get[double](held)
        # std::string is the only alternative left.
        return get[c_string](held)


cdef class _PandasConvertible(_Weakrefable):

def to_pandas(
Expand Down Expand Up @@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible):
if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU:
raise NotImplementedError("Implemented only for data on CPU device")

@property
def statistics(self):
    """
    Statistics of the array.

    Returns None when the underlying array has no statistics attached;
    otherwise returns an ArrayStatistics wrapping them.
    """
    cdef ArrayStatistics wrapper
    sp_stat = self.sp_array.get().statistics()
    # A null shared pointer means there is nothing to wrap.
    if sp_stat.get() == nullptr:
        return None
    # Bypass __init__, which always raises, and install the pointer
    # through the C-level initializer instead.
    wrapper = ArrayStatistics.__new__(ArrayStatistics)
    wrapper.init(sp_stat)
    return wrapper


cdef _array_like_to_pandas(obj, options, types_mapper):
cdef:
Expand Down
22 changes: 22 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,16 @@ cdef extern from "arrow/util/future.h" namespace "arrow" nogil:
CStatus status()


# Cython name for the concrete C++ type
# std::variant<bool, int64_t, uint64_t, double, std::string>.
# One constructor is declared for the default case and one for each
# alternative of the variant.
cdef extern from "<variant>" namespace "std" nogil:
cdef cppclass CArrayStatisticsValueType" std::variant<bool, int64_t, uint64_t, double, std::string>":
CArrayStatisticsValueType()
CArrayStatisticsValueType(c_bool)
CArrayStatisticsValueType(int64_t)
CArrayStatisticsValueType(uint64_t)
CArrayStatisticsValueType(double)
CArrayStatisticsValueType(c_string)


cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef enum Type" arrow::Type::type":
_Type_NA" arrow::Type::NA"
Expand Down Expand Up @@ -188,6 +198,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool is_primitive(Type type)
c_bool is_numeric(Type type)

# Cython declaration of the C++ type arrow::ArrayStatistics.
cdef cppclass CArrayStatistics" arrow::ArrayStatistics":
# Counts are optional: each may hold no value.
optional[int64_t] null_count
optional[int64_t] distinct_count
# Optional minimum, stored as a variant, and a flag for whether it is exact.
optional[CArrayStatisticsValueType] min
c_bool is_min_exact
# Optional maximum, stored as a variant, and a flag for whether it is exact.
optional[CArrayStatisticsValueType] max
c_bool is_max_exact

# Comparison against another statistics object.
c_bool Equals(const CArrayStatistics& statistics) const

cdef cppclass CArrayData" arrow::ArrayData":
shared_ptr[CDataType] type
int64_t length
Expand Down Expand Up @@ -251,6 +271,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CDeviceAllocationType device_type()
CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& to) const

const shared_ptr[CArrayStatistics]& statistics() const

shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
CResult[shared_ptr[CArray]] MakeArrayOfNull(
const shared_ptr[CDataType]& type, int64_t length, CMemoryPool* pool)
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,14 @@ cdef class Scalar(_Weakrefable):
cdef inline shared_ptr[CScalar] unwrap(self) nogil


# C-level declaration of the ArrayStatistics extension type: the attribute
# it stores and the cdef methods it exposes to other Cython code.
cdef class ArrayStatistics(_Weakrefable):
cdef:
# Shared pointer to the wrapped C++ arrow::ArrayStatistics object.
shared_ptr[CArrayStatistics] sp_statistics

# NOTE(review): this declaration carries `except *`, while the definition of
# init in array.pxi is written without it -- confirm this is intended.
cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics) except *
# Takes an optional variant value and returns a Python object.
cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value)


cdef class _PandasConvertible(_Weakrefable):
pass

Expand Down
19 changes: 19 additions & 0 deletions python/pyarrow/tests/parquet/test_parquet_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,3 +334,22 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
assert f.read() == table
assert not f.closed
assert f.closed


def test_read_statistics():
    """Check the statistics attached to an array read back from Parquet."""
    # Write a small nullable integer column to an in-memory Parquet file.
    source = pa.table({"value": pa.array([-1, None, 3])})
    sink = io.BytesIO()
    _write_table(source, sink)
    sink.seek(0)

    # Read it back and take the statistics of the first chunk of the
    # first column.
    loaded = pq.ParquetFile(sink).read()
    stats = loaded.columns[0].chunks[0].statistics

    assert stats.null_count == 1
    assert stats.distinct_count is None
    assert stats.min == -1
    assert stats.is_min_exact
    assert stats.max == 3
    assert stats.is_max_exact

    expected_repr = ("arrow.ArrayStatistics<"
                     "null_count=1, distinct_count=None, "
                     "min=-1, is_min_exact=True, "
                     "max=3, is_max_exact=True>")
    assert repr(stats) == expected_repr

0 comments on commit 0936f1e

Please sign in to comment.