Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-45457: [Python] Add pyarrow.ArrayStatistics #45550

Merged
merged 10 commits into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import os
import warnings
from cython import sizeof

# Free-function helpers from the C++ <variant> header. They are declared
# with variadic arguments so Cython does not need to know the exact variant
# type they are called with.
cdef extern from "<variant>" namespace "std":
# std::holds_alternative<T>(v): true when v currently stores a T.
c_bool holds_alternative[T](...)
# std::get<T>(v): access the stored alternative as a T.
T get[T](...)

cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
Expand Down Expand Up @@ -704,6 +707,101 @@ def _restore_array(data):
return pyarrow_wrap_array(MakeArray(ad))


cdef class ArrayStatistics(_Weakrefable):
"""
The class for statistics of an array.
"""

def __init__(self):
    # Calling the constructor from Python always fails with TypeError;
    # the message names the concrete class that was called.
    class_name = self.__class__.__name__
    raise TypeError(f"Do not call {class_name}'s constructor "
                    "directly")

cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics) except *:
    # The signature carries ``except *`` so that it matches the declaration
    # of this method in lib.pxd exactly.
    #
    # Keep a copy of the shared pointer; every property of this class reads
    # through ``self.sp_statistics``.
    self.sp_statistics = sp_statistics

def __repr__(self):
    # Render every public property as ``name=value``, in a fixed order,
    # inside ``arrow.ArrayStatistics<...>``.
    fields = ", ".join([
        f"null_count={self.null_count}",
        f"distinct_count={self.distinct_count}",
        f"min={self.min}",
        f"is_min_exact={self.is_min_exact}",
        f"max={self.max}",
        f"is_max_exact={self.is_max_exact}",
    ])
    return f"arrow.ArrayStatistics<{fields}>"

@property
def null_count(self):
    """
    The number of nulls.

    None is returned when the underlying statistics hold no null count.
    """
    count = self.sp_statistics.get().null_count
    # We'll be able to simplify this after
    # https://github.com/cython/cython/issues/6692 is solved.
    if not count.has_value():
        return None
    return count.value()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the record, I've opened a Cython feature request to make this more automatic.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I've added a comment that refers to the issue.


@property
def distinct_count(self):
    """
    The number of distinct values.

    None is returned when the underlying statistics hold no distinct
    count.
    """
    count = self.sp_statistics.get().distinct_count
    # We'll be able to simplify this after
    # https://github.com/cython/cython/issues/6692 is solved.
    if not count.has_value():
        return None
    return count.value()

@property
def min(self):
"""
The minimum value.

The stored C++ value is converted by ``_get_value``; None is returned
when no minimum is stored.
"""
return self._get_value(self.sp_statistics.get().min)

@property
def is_min_exact(self):
"""
Whether the minimum value is an exact value or not.

This is the boolean ``is_min_exact`` field of the underlying C++
statistics object.
"""
return self.sp_statistics.get().is_min_exact

@property
def max(self):
"""
The maximum value.

The stored C++ value is converted by ``_get_value``; None is returned
when no maximum is stored.
"""
return self._get_value(self.sp_statistics.get().max)

@property
def is_max_exact(self):
"""
Whether the maximum value is an exact value or not.

This is the boolean ``is_max_exact`` field of the underlying C++
statistics object.
"""
return self.sp_statistics.get().is_max_exact

cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value):
"""
Get a raw value from
std::optional<arrow::ArrayStatistics::ValueType> data.

arrow::ArrayStatistics::ValueType is
std::variant<bool, int64_t, uint64_t, double, std::string>.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uint64_t isn't handled below, should the docstring or the code be fixed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh... The code was wrong... I've added the uint64_t case.

"""
# An empty optional carries no value: report it as None.
if not optional_value.has_value():
return None
value = optional_value.value()
# Probe the variant alternatives one at a time and return whichever one
# is currently stored, in the order bool, int64_t, uint64_t, double.
if holds_alternative[c_bool](value):
return get[c_bool](value)
elif holds_alternative[int64_t](value):
return get[int64_t](value)
elif holds_alternative[uint64_t](value):
return get[uint64_t](value)
elif holds_alternative[double](value):
return get[double](value)
else:
# std::string is the only alternative left once the four above have
# been ruled out.
return get[c_string](value)


cdef class _PandasConvertible(_Weakrefable):

def to_pandas(
Expand Down Expand Up @@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible):
if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU:
raise NotImplementedError("Implemented only for data on CPU device")

@property
def statistics(self):
    """
    Statistics of the array.

    None is returned when the array carries no statistics object.
    """
    cdef ArrayStatistics wrapper
    c_statistics = self.sp_array.get().statistics()
    if c_statistics.get() == nullptr:
        return None
    # Bypass __init__, which always raises, and attach the C++ object.
    wrapper = ArrayStatistics.__new__(ArrayStatistics)
    wrapper.init(c_statistics)
    return wrapper


cdef _array_like_to_pandas(obj, options, types_mapper):
cdef:
Expand Down
22 changes: 22 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,16 @@ cdef extern from "arrow/util/future.h" namespace "arrow" nogil:
CStatus status()


# Cython-side name for the std::variant instantiation given in the C name
# string below (bool, int64_t, uint64_t, double, std::string).
cdef extern from "<variant>" namespace "std" nogil:
cdef cppclass CArrayStatisticsValueType" std::variant<bool, int64_t, uint64_t, double, std::string>":
# Default constructor, then one converting constructor per alternative.
CArrayStatisticsValueType()
CArrayStatisticsValueType(c_bool)
CArrayStatisticsValueType(int64_t)
CArrayStatisticsValueType(uint64_t)
CArrayStatisticsValueType(double)
CArrayStatisticsValueType(c_string)


cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef enum Type" arrow::Type::type":
_Type_NA" arrow::Type::NA"
Expand Down Expand Up @@ -188,6 +198,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool is_primitive(Type type)
c_bool is_numeric(Type type)

# Declaration of the C++ class arrow::ArrayStatistics.
cdef cppclass CArrayStatistics" arrow::ArrayStatistics":
# Optional counts: each may hold no value.
optional[int64_t] null_count
optional[int64_t] distinct_count
# Optional minimum, plus a flag for whether that minimum is exact.
optional[CArrayStatisticsValueType] min
c_bool is_min_exact
# Optional maximum, plus a flag for whether that maximum is exact.
optional[CArrayStatisticsValueType] max
c_bool is_max_exact

# Comparison against another statistics object.
c_bool Equals(const CArrayStatistics& statistics) const

cdef cppclass CArrayData" arrow::ArrayData":
shared_ptr[CDataType] type
int64_t length
Expand Down Expand Up @@ -251,6 +271,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CDeviceAllocationType device_type()
CResult[shared_ptr[CArray]] CopyTo(const shared_ptr[CMemoryManager]& to) const

# Shared pointer to the array's statistics. Callers must check it against
# nullptr before dereferencing.
const shared_ptr[CArrayStatistics]& statistics() const

shared_ptr[CArray] MakeArray(const shared_ptr[CArrayData]& data)
CResult[shared_ptr[CArray]] MakeArrayOfNull(
const shared_ptr[CDataType]& type, int64_t length, CMemoryPool* pool)
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,14 @@ cdef class Scalar(_Weakrefable):
cdef inline shared_ptr[CScalar] unwrap(self) nogil


# C-level layout and method declarations for ArrayStatistics.
cdef class ArrayStatistics(_Weakrefable):
cdef:
# The wrapped C++ statistics object.
shared_ptr[CArrayStatistics] sp_statistics

# Attach a C++ statistics object to this wrapper.
cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics) except *
# Convert an optional min/max variant into a Python object.
cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value)


# Declaration only: no C-level attributes or methods are declared for this
# class here.
cdef class _PandasConvertible(_Weakrefable):
pass

Expand Down
15 changes: 15 additions & 0 deletions python/pyarrow/tests/parquet/test_parquet_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,3 +334,18 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
assert f.read() == table
assert not f.closed
assert f.closed


def test_read_statistics():
    # Write a one-column table containing a null to an in-memory Parquet
    # file, read it back, and inspect the statistics attached to the first
    # chunk of the first column.
    table = pa.table({"value": pa.array([-1, None, 3])})
    buf = io.BytesIO()
    _write_table(table, buf)
    buf.seek(0)

    statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
    assert statistics.null_count == 1
    assert statistics.distinct_count is None
    assert statistics.min == -1
    assert statistics.is_min_exact
    assert statistics.max == 3
    assert statistics.is_max_exact
    # Also exercise the string representation so a broken __repr__ is
    # caught here.
    assert repr(statistics) == (
        "arrow.ArrayStatistics<"
        "null_count=1, distinct_count=None, "
        "min=-1, is_min_exact=True, "
        "max=3, is_max_exact=True>"
    )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have a test for repr(statistics) to make sure that the string representation works?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good idea. I've added it.

Loading