modin-project · dchigarev · Nov 7, 2023 · Nov 4, 2023 · Nov 6, 2023
@@ -191,6 +191,20 @@ def row_lengths(self):
                 self._row_lengths_cache = []
         return self._row_lengths_cache
 
+    def __len__(self) -> int:
+        """
+        Return the length of the index axis (the number of rows).
+
+        Returns
+        -------
+        int
+            Length of the index when it is materialized, sum of row lengths otherwise.
+        """
+        if not self.has_materialized_index:
+            # No materialized index to measure: add up the row lengths instead.
+            return sum(self.row_lengths)
+        return len(self.index)
+
     @property
     def column_widths(self):
         """
@@ -2421,10 +2435,8 @@ def _apply_func_to_range_partitioning(
 
         # don't want to inherit over-partitioning so doing this 'min' check
         ideal_num_new_partitions = min(len(self._partitions), NPartitions.get())
-        m = len(self.index) / ideal_num_new_partitions
-        sampling_probability = (1 / m) * np.log(
-            ideal_num_new_partitions * len(self.index)
-        )
+        m = len(self) / ideal_num_new_partitions
+        sampling_probability = (1 / m) * np.log(ideal_num_new_partitions * len(self))
         # If this df is overpartitioned, we try to sample each partition with probability
         # greater than 1, which leads to an error. In this case, we can do one of the following
         # two things. If there is only enough rows for one partition, and we have only 1 column
@@ -2435,8 +2447,8 @@ def _apply_func_to_range_partitioning(
         if sampling_probability >= 1:
             from modin.config import MinPartitionSize
 
-            ideal_num_new_partitions = round(len(self.index) / MinPartitionSize.get())
-            if len(self.index) < MinPartitionSize.get() or ideal_num_new_partitions < 2:
+            ideal_num_new_partitions = round(len(self) / MinPartitionSize.get())
+            if len(self) < MinPartitionSize.get() or ideal_num_new_partitions < 2:
                 # If the data is too small, we shouldn't try reshuffling/repartitioning but rather
                 # simply combine all partitions and apply the sorting to the whole dataframe
                 return self.combine_and_apply(func=func)

@@ -132,7 +132,7 @@ def __init__(
         ideal_num_new_partitions: int,
         **kwargs: dict,
     ):
-        self.frame_len = len(modin_frame.index)
+        self.frame_len = len(modin_frame)
         self.ideal_num_new_partitions = ideal_num_new_partitions
         self.columns = columns if is_list_like(columns) else [columns]
         self.ascending = ascending

@@ -3637,7 +3637,7 @@ def _groupby_shuffle(
         # Higher API level won't pass empty data here unless the frame has delayed
         # computations. FIXME: We apparently lose some laziness here (due to index access)
         # because of the inability to process empty groupby natively.
-        if len(self.columns) == 0 or len(self.index) == 0:
+        if len(self.columns) == 0 or len(self._modin_frame) == 0:
             return super().groupby_agg(
                 by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
             )
@@ -3832,7 +3832,7 @@ def groupby_agg(
         # Higher API level won't pass empty data here unless the frame has delayed
         # computations. So we apparently lose some laziness here (due to index access)
         # because of the inability to process empty groupby natively.
-        if len(self.columns) == 0 or len(self.index) == 0:
+        if len(self.columns) == 0 or len(self._modin_frame) == 0:
             return super().groupby_agg(
                 by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
             )