Skip to content

Commit

Permalink
add __len__ for PandasDataFrame
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev committed Nov 6, 2023
1 parent 71ebe9e commit 4cc0b75
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
29 changes: 18 additions & 11 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ def row_lengths(self):
self._row_lengths_cache = []
return self._row_lengths_cache

def __len__(self) -> int:
    """
    Return length of index axis.

    Returns
    -------
    int
    """
    # Use the already-materialized index when available so we don't
    # force computation of per-partition row lengths unnecessarily.
    if self.has_materialized_index:
        return len(self.index)
    return sum(self.row_lengths)

@property
def column_widths(self):
"""
Expand Down Expand Up @@ -2421,10 +2435,8 @@ def _apply_func_to_range_partitioning(

# don't want to inherit over-partitioning so doing this 'min' check
ideal_num_new_partitions = min(len(self._partitions), NPartitions.get())
m = sum(self.row_lengths) / ideal_num_new_partitions
sampling_probability = (1 / m) * np.log(
ideal_num_new_partitions * sum(self.row_lengths)
)
m = len(self) / ideal_num_new_partitions
sampling_probability = (1 / m) * np.log(ideal_num_new_partitions * len(self))
# If this df is overpartitioned, we try to sample each partition with probability
# greater than 1, which leads to an error. In this case, we can do one of the following
# two things. If there is only enough rows for one partition, and we have only 1 column
Expand All @@ -2435,13 +2447,8 @@ def _apply_func_to_range_partitioning(
if sampling_probability >= 1:
from modin.config import MinPartitionSize

ideal_num_new_partitions = round(
sum(self.row_lengths) / MinPartitionSize.get()
)
if (
sum(self.row_lengths) < MinPartitionSize.get()
or ideal_num_new_partitions < 2
):
ideal_num_new_partitions = round(len(self) / MinPartitionSize.get())
if len(self) < MinPartitionSize.get() or ideal_num_new_partitions < 2:
# If the data is too small, we shouldn't try reshuffling/repartitioning but rather
# simply combine all partitions and apply the sorting to the whole dataframe
return self.combine_and_apply(func=func)
Expand Down
2 changes: 1 addition & 1 deletion modin/core/dataframe/pandas/dataframe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def __init__(
ideal_num_new_partitions: int,
**kwargs: dict,
):
self.frame_len = sum(modin_frame.row_lengths)
self.frame_len = len(modin_frame)
self.ideal_num_new_partitions = ideal_num_new_partitions
self.columns = columns if is_list_like(columns) else [columns]
self.ascending = ascending
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3637,7 +3637,7 @@ def _groupby_shuffle(
# Higher API level won't pass empty data here unless the frame has delayed
# computations. FIXME: We apparently lose some laziness here (due to index access)
# because of the inability to process empty groupby natively.
if len(self.columns) == 0 or sum(self._modin_frame.row_lengths) == 0:
if len(self.columns) == 0 or len(self._modin_frame) == 0:
return super().groupby_agg(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)
Expand Down Expand Up @@ -3832,7 +3832,7 @@ def groupby_agg(
# Higher API level won't pass empty data here unless the frame has delayed
# computations. So we apparently lose some laziness here (due to index access)
# because of the inability to process empty groupby natively.
if len(self.columns) == 0 or sum(self._modin_frame.row_lengths) == 0:
if len(self.columns) == 0 or len(self._modin_frame) == 0:
return super().groupby_agg(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)
Expand Down

0 comments on commit 4cc0b75

Please sign in to comment.