Skip to content

Commit

Permalink
add __len__ for PandasDataFrame
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <[email protected]>
  • Loading branch information
anmyachev committed Nov 6, 2023
1 parent 71ebe9e commit 4cc0b75
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 14 deletions.
29 changes: 18 additions & 11 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ def row_lengths(self):
self._row_lengths_cache = []
return self._row_lengths_cache

def __len__(self) -> int:
    """
    Return length of index axis.

    Returns
    -------
    int
    """
    # Use the already-materialized index when available so we don't
    # force computation of per-partition row lengths unnecessarily.
    if self.has_materialized_index:
        return len(self.index)
    return sum(self.row_lengths)

@property
def column_widths(self):
"""
Expand Down Expand Up @@ -2421,10 +2435,8 @@ def _apply_func_to_range_partitioning(

# don't want to inherit over-partitioning so doing this 'min' check
ideal_num_new_partitions = min(len(self._partitions), NPartitions.get())
m = sum(self.row_lengths) / ideal_num_new_partitions
sampling_probability = (1 / m) * np.log(
ideal_num_new_partitions * sum(self.row_lengths)
)
m = len(self) / ideal_num_new_partitions
sampling_probability = (1 / m) * np.log(ideal_num_new_partitions * len(self))
# If this df is overpartitioned, we try to sample each partition with probability
# greater than 1, which leads to an error. In this case, we can do one of the following
# two things. If there is only enough rows for one partition, and we have only 1 column
Expand All @@ -2435,13 +2447,8 @@ def _apply_func_to_range_partitioning(
if sampling_probability >= 1:
from modin.config import MinPartitionSize

ideal_num_new_partitions = round(
sum(self.row_lengths) / MinPartitionSize.get()
)
if (
sum(self.row_lengths) < MinPartitionSize.get()
or ideal_num_new_partitions < 2
):
ideal_num_new_partitions = round(len(self) / MinPartitionSize.get())
if len(self) < MinPartitionSize.get() or ideal_num_new_partitions < 2:
# If the data is too small, we shouldn't try reshuffling/repartitioning but rather
# simply combine all partitions and apply the sorting to the whole dataframe
return self.combine_and_apply(func=func)
Expand Down
2 changes: 1 addition & 1 deletion modin/core/dataframe/pandas/dataframe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def __init__(
ideal_num_new_partitions: int,
**kwargs: dict,
):
self.frame_len = sum(modin_frame.row_lengths)
self.frame_len = len(modin_frame)
self.ideal_num_new_partitions = ideal_num_new_partitions
self.columns = columns if is_list_like(columns) else [columns]
self.ascending = ascending
Expand Down
4 changes: 2 additions & 2 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3637,7 +3637,7 @@ def _groupby_shuffle(
# Higher API level won't pass empty data here unless the frame has delayed
# computations. FIXME: We apparently lose some laziness here (due to index access)
# because of the inability to process empty groupby natively.
if len(self.columns) == 0 or sum(self._modin_frame.row_lengths) == 0:
if len(self.columns) == 0 or len(self._modin_frame) == 0:
return super().groupby_agg(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)
Expand Down Expand Up @@ -3832,7 +3832,7 @@ def groupby_agg(
# Higher API level won't pass empty data here unless the frame has delayed
# computations. So we apparently lose some laziness here (due to index access)
# because of the inability to process empty groupby natively.
if len(self.columns) == 0 or sum(self._modin_frame.row_lengths) == 0:
if len(self.columns) == 0 or len(self._modin_frame) == 0:
return super().groupby_agg(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)
Expand Down

0 comments on commit 4cc0b75

Please sign in to comment.