-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-45457: [Python] Add pyarrow.ArrayStatistics
#45550
Changes from 9 commits
cd2aa85
49bf5ef
5837501
d5a15f5
4a69312
8584053
bf7bdda
bca14f5
fd6d18e
e3a20b5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,9 @@ import os | |
import warnings | ||
from cython import sizeof | ||
|
||
# Minimal declarations from the C++ <variant> header (namespace std).
# ArrayStatistics._get_value uses them to test which alternative a variant
# currently holds (holds_alternative) and to extract it (get).  Both are
# declared with a variadic "(...)" parameter list, so the concrete variant
# type is not spelled out here.
cdef extern from "<variant>" namespace "std": | ||
c_bool holds_alternative[T](...) | ||
T get[T](...) | ||
|
||
cdef _sequence_to_array(object sequence, object mask, object size, | ||
DataType type, CMemoryPool* pool, c_bool from_pandas): | ||
|
@@ -704,6 +707,101 @@ def _restore_array(data): | |
return pyarrow_wrap_array(MakeArray(ad)) | ||
|
||
|
||
cdef class ArrayStatistics(_Weakrefable):
    """
    Statistics attached to an array.

    Instances wrap a shared pointer to the C++ statistics object and
    cannot be created by calling the constructor from Python.
    """

    def __init__(self):
        cls_name = self.__class__.__name__
        raise TypeError(f"Do not call {cls_name}'s constructor directly")

    cdef void init(self, const shared_ptr[CArrayStatistics]& sp_statistics):
        # Store the shared pointer that every property below reads from.
        self.sp_statistics = sp_statistics

    def __repr__(self):
        fields = ", ".join([
            f"null_count={self.null_count}",
            f"distinct_count={self.distinct_count}",
            f"min={self.min}",
            f"is_min_exact={self.is_min_exact}",
            f"max={self.max}",
            f"is_max_exact={self.is_max_exact}",
        ])
        return f"arrow.ArrayStatistics<{fields}>"

    @property
    def null_count(self):
        """
        The number of nulls, or None when no value is set.
        """
        count = self.sp_statistics.get().null_count
        # The explicit has_value()/value() dance can be simplified once
        # https://github.com/cython/cython/issues/6692 is solved.
        if not count.has_value():
            return None
        return count.value()

    @property
    def distinct_count(self):
        """
        The number of distinct values, or None when no value is set.
        """
        count = self.sp_statistics.get().distinct_count
        # The explicit has_value()/value() dance can be simplified once
        # https://github.com/cython/cython/issues/6692 is solved.
        if not count.has_value():
            return None
        return count.value()

    @property
    def min(self):
        """
        The minimum value, or None when no value is set.
        """
        return self._get_value(self.sp_statistics.get().min)

    @property
    def is_min_exact(self):
        """
        Whether the minimum value is exact.
        """
        return self.sp_statistics.get().is_min_exact

    @property
    def max(self):
        """
        The maximum value, or None when no value is set.
        """
        return self._get_value(self.sp_statistics.get().max)

    @property
    def is_max_exact(self):
        """
        Whether the maximum value is exact.
        """
        return self.sp_statistics.get().is_max_exact

    cdef _get_value(self, const optional[CArrayStatisticsValueType]& optional_value):
        """
        Unwrap a std::optional<arrow::ArrayStatistics::ValueType>.

        arrow::ArrayStatistics::ValueType is
        std::variant<bool, int64_t, uint64_t, double, std::string>.
        Return None for an empty optional, otherwise the alternative
        that the variant currently holds.
        """
        if not optional_value.has_value():
            return None
        held = optional_value.value()
        # Probe the alternatives in declaration order.
        if holds_alternative[c_bool](held):
            return get[c_bool](held)
        if holds_alternative[int64_t](held):
            return get[int64_t](held)
        if holds_alternative[uint64_t](held):
            return get[uint64_t](held)
        if holds_alternative[double](held):
            return get[double](held)
        # std::string is the only alternative left.
        return get[c_string](held)
|
||
|
||
cdef class _PandasConvertible(_Weakrefable): | ||
|
||
def to_pandas( | ||
|
@@ -2099,6 +2197,20 @@ cdef class Array(_PandasConvertible): | |
if self.sp_array.get().device_type() != CDeviceAllocationType_kCPU: | ||
raise NotImplementedError("Implemented only for data on CPU device") | ||
|
||
@property
def statistics(self):
    """
    Statistics of the array.

    Returns None when the underlying array has no statistics attached,
    otherwise an ArrayStatistics wrapping them.
    """
    cdef ArrayStatistics wrapper
    sp_statistics = self.sp_array.get().statistics()
    if sp_statistics.get() == nullptr:
        return None
    # Bypass __init__, which always raises, and attach the pointer
    # through the C-level init() instead.
    wrapper = ArrayStatistics.__new__(ArrayStatistics)
    wrapper.init(sp_statistics)
    return wrapper
|
||
|
||
cdef _array_like_to_pandas(obj, options, types_mapper): | ||
cdef: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -334,3 +334,18 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri): | |
assert f.read() == table | ||
assert not f.closed | ||
assert f.closed | ||
|
||
|
||
def test_read_statistics():
    """Check the statistics exposed on a chunk read back from Parquet."""
    # Write a three-value column containing one null to an in-memory file.
    sink = io.BytesIO()
    _write_table(pa.table({"value": pa.array([-1, None, 3])}), sink)
    sink.seek(0)

    column = pq.ParquetFile(sink).read().columns[0]
    stats = column.chunks[0].statistics
    assert stats.null_count == 1
    assert stats.distinct_count is None
    assert stats.min == -1
    assert stats.is_min_exact
    assert stats.max == 3
    assert stats.is_max_exact
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we have a test for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a good idea. I've added it. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the record, I've opened a Cython feature request to make this more automatic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I've added a comment that refers to the issue.