From 521eb6034c7d0b4abf4f3ad84acf330612038358 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 31 Oct 2023 15:42:29 +0100 Subject: [PATCH] PERF-#6668: Use `copy=False` for internal usage of `set_axis` (#6667) Signed-off-by: Anatoly Myachev --- modin/core/dataframe/pandas/dataframe/dataframe.py | 7 ++++++- modin/pandas/groupby.py | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 5d2b32d9f36..04f1f7448fd 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -808,7 +808,12 @@ def _propagate_index_objs(self, axis=None): if axis is None: def apply_idx_objs(df, idx, cols): - return df.set_axis(idx, axis="index").set_axis(cols, axis="columns") + # We should make at least one copy to avoid the data modification problem + # that may arise when sharing buffers from distributed storage + # (zero-copy pickling). + return df.set_axis(idx, axis="index").set_axis( + cols, axis="columns", copy=False + ) self._partitions = np.array( [ diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 8601bda9464..fb496cd979c 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -1990,7 +1990,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # because there is no need to identify which original column's aggregation # the new column represents. alternatively we could give the query compiler # a hint that it's for a series, not a dataframe. - return result.set_axis(labels=self._try_get_str_func(func), axis=1) + return result.set_axis( + labels=self._try_get_str_func(func), axis=1, copy=False + ) else: return super().aggregate( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs