From f5f9ae993ba5ed26461d3c9d26fbefecab88ee69 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Mon, 22 Jul 2024 05:42:51 -0700 Subject: [PATCH 01/20] DOCS-#0000: Update RunLLM Ask AI widget script path (#7345) Signed-off-by: Wei Chen --- docs/_static/custom.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/_static/custom.js b/docs/_static/custom.js index 6b867194374..552612baaa5 100644 --- a/docs/_static/custom.js +++ b/docs/_static/custom.js @@ -3,9 +3,8 @@ document.addEventListener("DOMContentLoaded", function () { script.type = "module"; script.id = "runllm-widget-script" - script.src = "https://cdn.jsdelivr.net/npm/@runllm/search-widget@stable/dist/run-llm-search-widget.es.js"; + script.src = "https://widget.runllm.com"; - script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-name", "Modin"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); From 7c1dde071632abb8c54eff7da0ab9d6448cef863 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 24 Jul 2024 15:39:20 -0500 Subject: [PATCH 02/20] FEAT-#7331: Initial Polars API (#7332) * FEAT-#7331: Initial Polars API This commit adds a polars namespace to Modin, and the DataFrame and Series objects and their respective APIs. This doesn't include error handling and is still missing several polars features: * LazyFrame * Expressions * String, Temporal, Struct, and other Series accessors * Several parameters * Operators that we don't have query compiler methods for * e.g. sin, cos, tan, etc. Those will be handled in a future PR. Signed-off-by: Devin Petersohn * Lint Signed-off-by: Devin Petersohn * flake8 Signed-off-by: Devin Petersohn * isort Signed-off-by: Devin Petersohn * headers Signed-off-by: Devin Petersohn * forgot one Signed-off-by: Devin Petersohn * Add test Signed-off-by: Devin Petersohn * header Signed-off-by: Devin Petersohn * isort Signed-off-by: Devin Petersohn * Add to CI Signed-off-by: Devin Petersohn * fix name Signed-off-by: Devin Petersohn * Update modin/polars/base.py Co-authored-by: Mahesh Vashishtha * address comments Signed-off-by: Devin Petersohn * polars 1 Signed-off-by: Devin Petersohn * Update for polars 1.x and fix some hacks Signed-off-by: Devin Petersohn * Remove hax Signed-off-by: Devin Petersohn * Black Signed-off-by: Devin Petersohn * Address comments Signed-off-by: Devin Petersohn * Lint Signed-off-by: Devin Petersohn * Address comment Signed-off-by: Devin Petersohn --------- Signed-off-by: Devin Petersohn Co-authored-by: Devin Petersohn Co-authored-by: Mahesh Vashishtha --- .github/workflows/ci.yml | 1 + environment-dev.yml | 1 + modin/polars/__init__.py | 17 + modin/polars/base.py | 668 ++++++++ modin/polars/dataframe.py | 1439 +++++++++++++++++ modin/polars/groupby.py | 247 +++ modin/polars/lazyframe.py | 22 + modin/polars/series.py | 2159 ++++++++++++++++++++++++++ modin/tests/polars/test_dataframe.py | 25 + 9 files changed, 4579 insertions(+) create mode 100644 modin/polars/__init__.py create mode 100644 modin/polars/base.py create mode 100644 modin/polars/dataframe.py create mode 100644 modin/polars/groupby.py create mode 100644 modin/polars/lazyframe.py create mode 100644 modin/polars/series.py create mode 100644 modin/tests/polars/test_dataframe.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ec1ca9d22d..5f82d4ca7f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -462,6 +462,7 @@ jobs: if: matrix.engine == 
'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' + - run: python -m pytest modin/tests/polars/test_dataframe.py - run: | python -m pip install lazy_import python -m pytest modin/tests/pandas/integrations/ diff --git a/environment-dev.yml b/environment-dev.yml index 3ea51032bde..049b3e39830 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -70,3 +70,4 @@ dependencies: - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.6.0 + - polars diff --git a/modin/polars/__init__.py b/modin/polars/__init__.py new file mode 100644 index 00000000000..3407698eb64 --- /dev/null +++ b/modin/polars/__init__.py @@ -0,0 +1,17 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from modin.polars.dataframe import DataFrame +from modin.polars.series import Series + +__all__ = ["DataFrame", "Series"] diff --git a/modin/polars/base.py b/modin/polars/base.py new file mode 100644 index 00000000000..010ee9e946c --- /dev/null +++ b/modin/polars/base.py @@ -0,0 +1,668 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Implement DataFrame/Series public API as polars does.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Sequence + +import polars + +from modin.core.storage_formats import BaseQueryCompiler + +if TYPE_CHECKING: + import numpy as np + + from modin.polars import DataFrame, Series + + +class BasePolarsDataset: + + _query_compiler: BaseQueryCompiler + + @property + def __constructor__(self): + """ + DataFrame constructor. 
+ + Returns: + Constructor of the DataFrame + """ + return type(self) + + def __eq__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.eq( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __ne__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.ne( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __add__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.add( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __sub__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.sub( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __mul__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.mul( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __truediv__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.truediv( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __floordiv__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.floordiv( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __mod__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.mod( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __pow__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.pow( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __and__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__and__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __or__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__or__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __xor__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__xor__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __lt__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.lt( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __le__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.le( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __gt__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.gt( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __ge__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.ge( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __invert__(self) -> "BasePolarsDataset": + return 
self.__constructor__(_query_compiler=self._query_compiler.invert()) + + def __neg__(self) -> "BasePolarsDataset": + return self.__constructor__(_query_compiler=self._query_compiler.negative()) + + def __abs__(self) -> "BasePolarsDataset": + return self.__constructor__(_query_compiler=self._query_compiler.abs()) + + def is_duplicated(self): + """ + Determine whether each row is a duplicate in the DataFrame. + + Returns: + DataFrame with True for each duplicate row, and False for unique rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.duplicated(keep=False) + ) + + def is_empty(self) -> bool: + """ + Determine whether the DataFrame is empty. + + Returns: + True if the DataFrame is empty, False otherwise + """ + return self.height == 0 + + def is_unique(self): + """ + Determine whether each row is unique in the DataFrame. + + Returns: + DataFrame with True for each unique row, and False for duplicate rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.duplicated(keep=False).invert() + ) + + def n_chunks(self, strategy: str = "first") -> int | list[int]: + raise NotImplementedError("not yet") + + def to_arrow(self): + """ + Convert the DataFrame to Arrow format. + + Returns: + Arrow representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_arrow() + + def to_jax(self, device=None): + """ + Convert the DataFrame to JAX format. + + Args: + device: The device to use. + + Returns: + JAX representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_jax( + device=device + ) + + def to_numpy( + self, + *, + writable: bool = False, + allow_copy: bool = True, + use_pyarrow: bool | None = None, + zero_copy_only: bool | None = None, + ) -> "np.ndarray": + """ + Convert the DataFrame to a NumPy representation. + + Args: + writable: Whether the NumPy array should be writable. + allow_copy: Whether to allow copying the data. + use_pyarrow: Whether to use PyArrow for conversion. + zero_copy_only: Whether to use zero-copy conversion only. + + Returns: + NumPy representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_numpy( + writable=writable, + allow_copy=allow_copy, + use_pyarrow=use_pyarrow, + zero_copy_only=zero_copy_only, + ) + + def to_torch(self): + """ + Convert the DataFrame to PyTorch format. + + Returns: + PyTorch representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_torch() + + def bottom_k( + self, + k: int, + *, + by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + maintain_order: bool | None = None, + ) -> "BasePolarsDataset": + raise NotImplementedError("not yet") + + def cast(self, dtypes, *, strict: bool = True) -> "BasePolarsDataset": + """ + Cast the DataFrame to the given dtypes. + + Args: + dtypes: Dtypes to cast the DataFrame to. + strict: Whether to enforce strict casting. + + Returns: + DataFrame with the new dtypes. + """ + # TODO: support strict + return self.__constructor__(_query_compiler=self._query_compiler.astype(dtypes)) + + def clone(self) -> "BasePolarsDataset": + """ + Clone the DataFrame. + + Returns: + Cloned DataFrame. + """ + return self.copy() + + def drop_nulls(self, subset=None): + """ + Drop the rows with null values. + + Args: + subset: Columns to consider for null values. + + Returns: + DataFrame with the rows with null values dropped. 
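+
+        Example (a minimal sketch; the column name ``a`` is hypothetical):
+            >>> from modin.polars import DataFrame
+            >>> df = DataFrame({"a": [1.0, None, 3.0]})
+            >>> df.drop_nulls()  # keeps only the rows with no null values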
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.dropna(subset=subset, how="any") + ) + + def explode(self, columns: str, *more_columns: str) -> "BasePolarsDataset": + """ + Explode the given columns to long format. + + Args: + columns: Columns to explode. + more_columns: Additional columns to explode. + + Returns: + DataFrame with the columns exploded. + """ + if len(more_columns) > 0: + columns = [columns, *more_columns] + return self.__constructor__( + _query_compiler=self._query_compiler.explode(columns) + ) + + def extend(self, other: "BasePolarsDataset") -> "BasePolarsDataset": + """ + Extend the DataFrame with another DataFrame. + + Args: + other: DataFrame to extend with. + + Returns: + Extended DataFrame for convenience. DataFrame is modified in place. + """ + self._query_compiler = self._query_compiler.concat( + axis=0, other=other._query_compiler + ) + return self + + def fill_nan(self, value): + """ + Fill NaN values with the given value. + + Args: + value: Value to fill NaN values with. + + Returns: + DataFrame with NaN values filled. + """ + # TODO: Handle null values differently than nan. + return self.__constructor__(_query_compiler=self._query_compiler.fillna(value)) + + def fill_null( + self, + value: Any | None = None, + strategy: str | None = None, + limit: int | None = None, + *, + matches_supertype: bool = True, + ) -> "BasePolarsDataset": + """ + Fill null values with the given value or strategy. + + Args: + value: Value to fill null values with. + strategy: Strategy to fill null values with. + limit: Maximum number of null values to fill. + matches_supertype: Whether the value matches the supertype. + + Returns: + DataFrame with null values filled. + """ + if strategy == "forward": + strategy = "ffill" + elif strategy == "backward": + strategy = "bfill" + elif strategy in ["min", "max", "mean"]: + value = getattr(self, strategy)()._query_compiler + strategy = None + elif strategy == "zero": + strategy = None + value = 0 + elif strategy == "one": + strategy = None + value = 1 + else: + raise ValueError(f"Unknown strategy: {strategy}") + return self.__constructor__( + _query_compiler=self._query_compiler.fillna( + value=value, method=strategy, limit=limit + ) + ) + + def filter(self, *predicates, **constraints: Any) -> "BasePolarsDataset": + predicates = predicates[0] + for p in predicates[1:]: + predicates = predicates & p + if constraints: + raise NotImplementedError("Named constraints are not supported") + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_array( + predicates._query_compiler + ) + ) + + def gather_every(self, n: int, offset: int = 0) -> "BasePolarsDataset": + """ + Gather every nth row of the DataFrame. + + Args: + n: Number of rows to gather. + offset: Offset to start gathering from. + + Returns: + DataFrame with every nth row gathered. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array( + slice(offset, None, n) + ) + ) + + def head(self, n: int = 5) -> "BasePolarsDataset": + """ + Get the first n rows of the DataFrame. + + Args: + n: Number of rows to get. + + Returns: + DataFrame with the first n rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array(slice(0, n)) + ) + + def limit(self, n: int = 10) -> "BasePolarsDataset": + """ + Limit the DataFrame to the first n rows. + + Args: + n: Number of rows to limit to. + + Returns: + DataFrame with the first n rows. 
+ """ + return self.head(n) + + def interpolate(self) -> "BasePolarsDataset": + """ + Interpolate values the DataFrame using a linear method. + + Returns: + DataFrame with the interpolated values. + """ + return self.__constructor__(_query_compiler=self._query_compiler.interpolate()) + + def sample( + self, + n: int | "Series" | None = None, + *, + fraction: float | "Series" | None = None, + with_replacement: bool = False, + shuffle: bool = False, + seed: int | None = None, + ) -> "BasePolarsDataset": + """ + Sample the DataFrame. + + Args: + n: Number of rows to sample. + fraction: Fraction of rows to sample. + with_replacement: Whether to sample with replacement. + shuffle: Whether to shuffle the rows. + seed: Seed for the random number generator. + + Returns: + Sampled DataFrame. + """ + return self.__constructor__( + _query_compiler=self.to_pandas() + .sample(n=n, frac=fraction, replace=with_replacement, random_state=seed) + ._query_compiler + ) + + def shift(self, n: int = 1, *, fill_value=None) -> "DataFrame": + raise NotImplementedError("not yet") + + def shrink_to_fit(self) -> "DataFrame": + """ + Shrink the DataFrame to fit in memory. + + Returns: + A copy of the DataFrame. + """ + return self.copy() + + def slice(self, offset: int, length: int) -> "DataFrame": + """ + Slice the DataFrame. + + Args: + offset: Offset to start the slice from. + length: Length of the slice. + + Returns: + Sliced DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array( + slice(offset, offset + length) + ) + ) + + def sort( + self, + by, + *more_by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + multithreaded: bool = True, + maintain_order: bool = False, + ) -> "DataFrame": + """ + Sort the DataFrame. + + Args: + by: Column to sort by. + more_by: Additional columns to sort by. + descending: Whether to sort in descending order. + nulls_last: Whether to sort null values last. + multithreaded: Whether to use multiple threads. + maintain_order: Whether to maintain the order of the DataFrame. + + Returns: + Sorted DataFrame. + """ + # TODO: support expressions in by + if len(more_by) > 0: + by = [by, *more_by] + return self.__constructor__( + _query_compiler=self._query_compiler.sort_rows_by_column_values( + by=by, + reverse=descending, + nulls_first=None if nulls_last is None else not nulls_last, + ) + ) + + def tail(self, n: int = 5) -> "DataFrame": + """ + Get the last n rows of the DataFrame. + + Args: + n: Number of rows to get. + + Returns: + DataFrame with the last n rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array(slice(-n, None)) + ) + + def to_dummies( + self, + columns: str | Sequence[str] | None = None, + *, + separator: str = "_", + drop_first: bool = False, + ) -> "DataFrame": + """ + Convert the columns to dummy variables. + + Args: + columns: Columns to convert to dummy variables. + separator: Separator for the dummy variables. + drop_first: Whether to drop the first dummy variable. + + Returns: + DataFrame with the columns converted to dummy variables. 
+ """ + if columns is not None: + if isinstance(columns, str): + columns = [columns] + else: + columns = self.columns + result = self.__constructor__( + _query_compiler=self._query_compiler.get_dummies(columns) + ) + if separator != "_": + result.columns = [ + c.replace(separator, "_") if separator in c else c + for c in result.columns + ] + if drop_first: + columns_to_drop = [ + next( + result_col + for result_col in result.columns + if result_col.startswith(c) + ) + for c in columns + ] + return result.drop(columns_to_drop) + else: + return result + + def top_k( + self, + k: int, + *, + by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + maintain_order: bool | None = None, + ) -> "DataFrame": + raise NotImplementedError("not yet") + + def unique(self, subset=None, *, keep="any", maintain_order: bool = False): + """ + Get the unique values in each column. + + Args: + subset: Columns to consider for unique values. + keep: Strategy to keep unique values. + maintain_order: Whether to maintain the order of the unique values. + + Returns: + DataFrame with the unique values in each column. + """ + if keep == "none" or keep == "last": + # TODO: support keep="none" + raise NotImplementedError("not yet") + return self.__constructor__( + _query_compiler=self._query_compiler.unique(subset=subset) + ) + + def equals(self, other: "BasePolarsDataset", *, null_equal: bool = True) -> bool: + """ + Determine whether the DataFrame is equal to another DataFrame. + + Args: + other: DataFrame to compare with. + + Returns: + True if the DataFrames are equal, False otherwise. + """ + return ( + isinstance(other, type(self)) + and self._query_compiler.equals(other._query_compiler) + and ( + null_equal + or ( + not self.to_pandas().isna().any(axis=None) + and not other.to_pandas().isna().any(axis=None) + ) + ) + ) + + @property + def plot(self): + return polars.from_pandas(self._query_compiler.to_pandas()).plot + + def count(self): + """ + Get the number of non-null values in each column. + + Returns: + DataFrame with the counts. + """ + return self.__constructor__(_query_compiler=self._query_compiler.count(axis=0)) diff --git a/modin/polars/dataframe.py b/modin/polars/dataframe.py new file mode 100644 index 00000000000..d4408ff39f0 --- /dev/null +++ b/modin/polars/dataframe.py @@ -0,0 +1,1439 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +"""Module houses ``DataFrame`` class, that is distributed version of ``polars.DataFrame``.""" + +from __future__ import annotations + +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Sequence + +import numpy as np +import pandas +import polars +from pandas.core.dtypes.common import is_list_like + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.pandas import DataFrame as ModinPandasDataFrame +from modin.pandas import Series as ModinPandasSeries +from modin.pandas.io import from_pandas +from modin.polars.base import BasePolarsDataset + +if TYPE_CHECKING: + from modin.polars import Series + from modin.polars.groupby import GroupBy + from modin.polars.lazyframe import LazyFrame + + +class DataFrame(BasePolarsDataset): + + def __init__( + self, + data=None, + schema=None, + *, + schema_overrides=None, + strict=True, + orient=None, + infer_schema_length=100, + nan_to_null=False, + _query_compiler=None, + ) -> None: + """ + Constructor for DataFrame object. + + Args: + data: Data to be converted to DataFrame. + schema: Schema of the data. + schema_overrides: Schema overrides. + strict: Whether to enforce strict schema. + orient: Orientation of the data. + infer_schema_length: Length of the data to infer schema. + nan_to_null: Whether to convert NaNs to nulls. + _query_compiler: Query compiler to use. + """ + if _query_compiler is None: + if isinstance(data, (ModinPandasDataFrame, ModinPandasSeries)): + self._query_compiler: BaseQueryCompiler = data._query_compiler.copy() + else: + self._query_compiler: BaseQueryCompiler = from_pandas( + polars.DataFrame( + data=data, + schema=schema, + schema_overrides=schema_overrides, + strict=strict, + orient=orient, + infer_schema_length=infer_schema_length, + nan_to_null=nan_to_null, + ).to_pandas() + )._query_compiler + else: + self._query_compiler: BaseQueryCompiler = _query_compiler + + def __getitem__(self, item): + """ + Get item from DataFrame. + + Args: + item: Column to get. + + Returns: + Series or DataFrame with the column. + """ + if is_list_like(item): + missing = [i for i in item if i not in self.columns] + if len(missing) > 0: + raise polars.exceptions.ColumnNotFoundError(missing[0]) + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_array(item) + ) + else: + if item not in self.columns: + raise polars.exceptions.ColumnNotFoundError(item) + from .series import Series + + return Series(_query_compiler=self._query_compiler.getitem_array([item])) + + def _to_polars(self) -> polars.DataFrame: + """ + Convert the DataFrame to Polars format. + + Returns: + Polars representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()) + + def _get_columns(self): + """ + Get columns of the DataFrame. + + Returns: + List of columns. + """ + return list(self._query_compiler.columns) + + def _set_columns(self, new_columns): + """ + Set columns of the DataFrame. + + Args: + new_columns: New columns to set. 
+ """ + new_query_compiler = self._query_compiler.copy() + new_query_compiler.columns = new_columns + self._query_compiler = new_query_compiler + + columns = property(_get_columns, _set_columns) + + _sorted_columns_cache = None + + def _get_sorted_columns(self): + if self._sorted_columns_cache is None: + self._sorted_columns_cache = [False] * len(self.columns) + return self._sorted_columns_cache + + def _set_sorted_columns(self, value): + self._sorted_columns_cache = value + + _sorted_columns = property(_get_sorted_columns, _set_sorted_columns) + + @property + def dtypes(self): + """ + Get dtypes of the DataFrame. + + Returns: + List of dtypes. + """ + return polars.from_pandas( + pandas.DataFrame(columns=self.columns).astype(self._query_compiler.dtypes) + ).dtypes + + @property + def flags(self): + """ + Get flags of the DataFrame. + + Returns: + List of flags. + """ + # TODO: Add flags support + return [] + + @property + def height(self): + """ + Get height of the DataFrame. + + Returns: + Number of rows in the DataFrame. + """ + return len(self._query_compiler.index) + + @property + def schema(self): + """ + Get schema of the DataFrame. + + Returns: + OrderedDict of column names and dtypes. + """ + return OrderedDict(zip(self.columns, self.dtypes, strict=True)) + + @property + def shape(self): + """ + Get shape of the DataFrame. + + Returns: + Tuple of (height, width + """ + return self.height, self.width + + @property + def width(self): + """ + Get width of the DataFrame. + + Returns: + Number of columns in the DataFrame. + """ + return len(self.columns) + + def __repr__(self): + """ + Get string representation of the DataFrame. + + Returns: + String representation of the DataFrame. + """ + return repr(polars.from_pandas(self._query_compiler.to_pandas())) + + def max(self, axis=None): + """ + Get the maximum value in each column. + + Args: + axis: Axis to get the maximum value on. + + Returns: + DataFrame with the maximum values. + """ + if axis is None or axis == 0: + return self.__constructor__( + _query_compiler=self._query_compiler.max(axis=0) + ) + else: + return self.max_horizontal() + + def max_horizontal(self): + """ + Get the maximum value in each row. + + Returns: + DataFrame with the maximum values. + """ + return self.__constructor__(_query_compiler=self._query_compiler.max(axis=1)) + + def _convert_non_numeric_to_null(self): + """ + Convert non-numeric columns to null. + + Returns: + DataFrame with non-numeric columns converted to null. + """ + non_numeric_cols = [ + c + for c, t in zip(self.columns, self.dtypes, strict=True) + if not t.is_numeric() + ] + if len(non_numeric_cols) > 0: + return self.__constructor__( + _query_compiler=self._query_compiler.write_items( + slice(None), + [self.columns.index(c) for c in non_numeric_cols], + pandas.NA, + need_columns_reindex=False, + ).astype({c: self._query_compiler.dtypes[c] for c in non_numeric_cols}) + ) + return self.copy() + + def mean(self, *, axis=None, null_strategy="ignore"): + """ + Get the mean of each column. + + Args: + axis: Axis to get the mean on. + null_strategy: Strategy to handle null values. + + Returns: + DataFrame with the mean of each column or row. 
+ """ + # TODO: this converts non numeric columns to numeric + obj = self._convert_non_numeric_to_null() + if axis is None or axis == 0: + return self.__constructor__( + _query_compiler=obj._query_compiler.mean( + axis=0, + skipna=True if null_strategy == "ignore" else False, + ) + ) + else: + return obj.mean_horizontal( + ignore_nulls=True if null_strategy == "ignore" else False + ) + + def median(self) -> "DataFrame": + """ + Get the median of each column. + + Returns: + DataFrame with the median of each column. + """ + return self.__constructor__( + _query_compiler=self._convert_non_numeric_to_null()._query_compiler.median( + 0 + ) + ) + + def mean_horizontal(self, *, ignore_nulls: bool = True): + """ + Get the mean of each row. + + Args: + ignore_nulls: Whether to ignore null values. + + Returns: + DataFrame with the mean of each row. + """ + obj = self._convert_non_numeric_to_null() + return self.__constructor__( + _query_compiler=obj._query_compiler.mean(axis=1, skipna=ignore_nulls) + ) + + def min(self, axis=None): + """ + Get the minimum value in each column. + + Args: + axis: Axis to get the minimum value on. + + Returns: + DataFrame with the minimum values of each row or column. + """ + if axis is None or axis == 0: + return self.__constructor__( + _query_compiler=self._query_compiler.min(axis=0) + ) + else: + return self.max_horizontal() + + def min_horizontal(self): + """ + Get the minimum value in each row. + + Returns: + DataFrame with the minimum values of each row. + """ + return self.__constructor__(_query_compiler=self._query_compiler.min(axis=1)) + + def product(self): + """ + Get the product of each column. + + Returns: + DataFrame with the product of each column. + """ + obj = self._convert_non_numeric_to_null() + return self.__constructor__(_query_compiler=obj._query_compiler.prod(axis=0)) + + def quantile(self, quantile: float, interpolation="nearest"): + """ + Get the quantile of each column. + + Args: + quantile: Quantile to get. + interpolation: Interpolation method. + + Returns: + DataFrame with the quantile of each column. + """ + obj = self._convert_non_numeric_to_null() + # TODO: interpolation support + return self.__constructor__( + _query_compiler=obj._query_compiler.quantile_for_single_value(quantile) + ) + + def std(self, ddof: int = 1): + """ + Get the standard deviation of each column. + + Args: + ddof: Delta degrees of freedom. + + Returns: + DataFrame with the standard deviation of each column + """ + obj = self._convert_non_numeric_to_null() + return self.__constructor__(_query_compiler=obj._query_compiler.std(ddof=ddof)) + + def sum(self, axis: int | None = None, null_strategy="ignore"): + """ + Get the sum of each column. + + Args: + axis: Axis to get the sum on. + null_strategy: Strategy to handle null values. + + Returns: + DataFrame with the sum of each column or row. + """ + obj = self._convert_non_numeric_to_null() + if axis is None or axis == 0: + return self.__constructor__( + _query_compiler=obj._query_compiler.sum( + axis=0, + skipna=True if null_strategy == "ignore" else False, + ) + ) + else: + return obj.sum_horizontal( + ignore_nulls=True if null_strategy == "ignore" else False + ) + + def sum_horizontal(self, *, ignore_nulls: bool = True): + """ + Get the sum of each row. + + Args: + ignore_nulls: Whether to ignore null values. + + Returns: + DataFrame with the sum of each row. 
+ """ + # TODO: if there are strings in the row, polars will append numeric values + # this behavior may not be intended so doing this instead (for now) + obj = self._convert_non_numeric_to_null() + return self.__constructor__( + _query_compiler=obj._query_compiler.sum(axis=1, skipna=ignore_nulls) + ) + + def var(self, ddof: int = 1): + """ + Get the variance of each column. + + Args: + ddof: Delta degrees of freedom. + + Returns: + DataFrame with the variance of each column. + """ + obj = self._convert_non_numeric_to_null() + return self.__constructor__(_query_compiler=obj._query_compiler.var(ddof=ddof)) + + def approx_n_unique(self): + """ + Get the approximate number of unique values in each column. + + Returns: + DataFrame with the approximate number of unique values in each column. + """ + return self.__constructor__(_query_compiler=self._query_compiler.nunique()) + + def describe(self, percentiles: Sequence[float] | float = (0.25, 0.5, 0.75)): + """ + Get the descriptive statistics of each column. + + Args: + percentiles: Percentiles to get. + + Returns: + DataFrame with the descriptive statistics of each column. + """ + return self.__constructor__( + self.__constructor__( + _query_compiler=self._query_compiler.describe( + percentiles=np.array(percentiles) + ).astype( + { + k: str + for k, v in zip(self.columns, self.dtypes, strict=True) + if v == polars.String + } + ) + ) + .to_pandas() + .loc[ + [ + "count", + # "null_count", TODO: support null_count in describe + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] + ] + .reset_index() + .rename({"index": "statistic"}) + ) + + def estimated_size(self, unit="b"): + """ + Get the estimated amount of memory used by the DataFrame. + + Args: + unit: Unit of the memory size. + + Returns: + DataFrame with the extimated memory usage. + """ + return self.__constructor__(_query_compiler=self._query_compiler.memory_usage()) + + def glimpse( + self, + *, + max_items_per_column: int = 10, + max_colname_length: int = 50, + return_as_string: bool = False, + ) -> str | None: + raise NotImplementedError("not yet") + + def n_unique(self, subset=None) -> int: + """ + Get the number of unique values in each column. + + Args: + subset: Columns to get the number of unique values for. + + Returns: + Number of unique values in each column. + """ + if subset is not None: + raise NotImplementedError("not yet") + return ( + self.is_unique()._query_compiler.sum(axis=0).to_pandas().squeeze(axis=None) + ) + + def null_count(self) -> "DataFrame": + """ + Get the number of null values in each column. + + Returns: + DataFrame with the number of null values in each column. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.isna().sum(axis=0) + ) + + def to_pandas(self): + """ + Convert the DataFrame to Pandas format. + + Returns: + modin.pandas representation of the DataFrame. + """ + return ModinPandasDataFrame(query_compiler=self._query_compiler.copy()) + + def group_by( + self, + *by, + maintain_order: bool = False, + **named_by, + ) -> "GroupBy": + """ + Group the DataFrame by the given columns. + + Args: + by: Columns to group by. + maintain_order: Whether to maintain the order of the groups. + named_by: Named columns to group by. + + Returns: + GroupBy object. + """ + from modin.polars.groupby import GroupBy + + return GroupBy(self, *by, maintain_order=maintain_order, **named_by) + + def drop(self, *columns, strict: bool = True) -> "DataFrame": + """ + Drop the given columns. + + Args: + columns: Columns to drop. 
+ strict: Whether to raise an error if a column is not found. + + Returns: + DataFrame with the columns dropped. + """ + if strict: + for c in columns: + if c not in self.columns: + raise KeyError(c) + columns = list(columns) if not isinstance(columns[0], list) else columns[0] + return self.__constructor__(_query_compiler=self._query_compiler.drop(columns)) + + def drop_in_place(self, name: str) -> "DataFrame": + """ + Drop the given column in place and return the dropped column. + + Args: + name: Column to drop. + + Returns: + The column that was dropped from the DataFrame. + """ + col_to_return = self[name] + self._query_compiler = self._query_compiler.drop([name]) + return col_to_return + + def get_column(self, name: str) -> "Series": + """ + Get the column by name. + + Args: + name: Name of the column to get. + + Returns: + Series with the column. + """ + return self[name] + + def get_column_index(self, name: str) -> int: + """ + Find the index of the column by name. + + Args: + name: Name of the column to find. + + Returns: + Index of the column. + """ + return self.columns.index(name) + + def get_columns(self) -> list["Series"]: + """ + Get the columns of the DataFrame. + + Returns: + List of Series with the columns. + """ + return [self[name] for name in self.columns] + + def group_by_dynamic( + self, + index_column, + *, + every, + period, + offset, + truncate, + include_boundaries, + closed, + label, + group_by, + start_by, + check_sorted, + ): + raise NotImplementedError("not yet") + + def hstack(self, columns, *, inplace: bool = False) -> "DataFrame": + """ + Stack the given columns horizontally. + + Args: + columns: Columns to stack. + inplace: Whether to stack the columns in place. + + Returns: + DataFrame with the columns stacked horizontally. + """ + if isinstance(columns, DataFrame): + columns = columns.get_columns() + result_query_compiler = self._query_compiler.concat( + axis=1, other=[c._query_compiler for c in columns] + ) + if inplace: + self._query_compiler = result_query_compiler + return self + return self.__constructor__(_query_compiler=result_query_compiler) + + def insert_column(self, index: int, column: "Series") -> "DataFrame": + """ + Insert the given column at the given index. + + Args: + index: Index to insert the column at. + column: Column to insert. + name: Name of the column to insert. + + Returns: + DataFrame with the column inserted. + """ + return self.__constructor__( + self._query_compiler.insert(index, column.name, column._query_compiler) + ) + + def item(self, row: int | None = None, column: str | int | None = None) -> Any: + """ + Get the value at the given row and column. + + Args: + row: Row to get the value from. + column: Column to get the value from. + + Returns: + Value at the given row and column. + """ + if row is None: + row = 0 + if column is None: + column = 0 + if isinstance(column, str): + column = self.columns.index(column) + return ( + self._query_compiler.take_2d_labels(row, column) + .to_pandas() + .squeeze(axis=None) + ) + + def iter_columns(self) -> Iterator["Series"]: + """ + Iterate over the columns of the DataFrame. + + Returns: + Iterator over the columns. + """ + return iter(self.get_columns()) + + def iter_rows( + self, + *, + named: bool = False, + buffer_size: int = 512, + ) -> Iterator[tuple[Any]] | Iterator[dict[str, Any]]: + """ + Iterate over the rows of the DataFrame. + + Returns: + Iterator over the rows. 
+ """ + raise NotImplementedError("not yet") + + def iter_slices( + self, + n_rows: int = 10000, + ) -> Iterator["DataFrame"]: + """ + Iterate over the slices of the DataFrame. + + Args: + n_rows: Number of rows in each slice. + + Returns: + Iterator over the slices. + """ + raise NotImplementedError("not yet") + + def join( + self, + other: "DataFrame", + on: str | list[str] | None = None, + how: str = "inner", + *, + left_on: str | list[str] | None = None, + right_on: str | list[str] | None = None, + suffix: str = "_right", + validate="m:m", + join_nulls: bool = False, + coalesce: bool | None = None, + ) -> "DataFrame": + """ + Join the DataFrame with another DataFrame. + + Args: + other: DataFrame to join with. + on: Column to join on. + how: How to join the DataFrames. + + Returns: + Joined DataFrame. + """ + if how == "full": + how = "outer" + elif how == "cross": + raise NotImplementedError("not yet") + elif how == "semi": + how = "right" + elif how == "anti": + raise NotImplementedError("not yet") + return self.__constructor__( + _query_compiler=self._query_compiler.merge( + other._query_compiler, + on=on, + how=how, + suffixes=("", suffix), + left_on=left_on, + right_on=right_on, + ) + ) + + def join_asof( + self, + other: "DataFrame", + *, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, + by_left: str | Sequence[str] | None = None, + by_right: str | Sequence[str] | None = None, + by: str | Sequence[str] | None = None, + strategy: str = "backward", + suffix: str = "_right", + tolerance: str, + ) -> "DataFrame": + """ + Join the DataFrame with another DataFrame using asof logic. + + Args: + other: DataFrame to join with. + left_on: Column to join on in the left DataFrame. + right_on: Column to join on in the right DataFrame. + on: Column to join on in both DataFrames. + by_left: Columns to join on in the left DataFrame. + by_right: Columns to join on in the right DataFrame. + by: Columns to join on in both DataFrames. + strategy: Strategy to use for the join. + suffix: Suffix to add to the columns. + tolerance: Tolerance for the join. + + Returns: + Joined DataFrame. + """ + if on is not None and left_on is None and right_on is None: + left_on = right_on = on + if by is not None and by_left is None and by_right is None: + by_left = by_right = by + return self.__constructor__( + _query_compiler=self._query_compiler.merge_asof( + other._query_compiler, + left_on=left_on, + right_on=right_on, + left_by=by_left, + right_by=by_right, + direction=strategy, + suffixes=("", suffix), + tolerance=tolerance, + ) + ) + + def melt( + self, + id_vars=None, + value_vars=None, + variable_name: str | None = None, + value_name: str | None = None, + ) -> "DataFrame": + """ + Melt the DataFrame. + + Args: + id_vars: Columns to keep. + value_vars: Columns to melt. + variable_name: Name of the variable column. + value_name: Name of the value column. + + Returns: + Melted DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.melt( + id_vars=id_vars, + value_vars=value_vars, + var_name=variable_name, + value_name=value_name, + ) + ) + + def merge_sorted(self, other: "DataFrame", on: str | list[str]) -> "DataFrame": + # TODO: support natural join + sort + raise NotImplementedError("not yet") + + def partition_by( + self, + by, + *more_by, + maintain_order: bool = True, + include_key: bool = True, + as_dict: bool = False, + ) -> list["DataFrame"] | dict[Any, "DataFrame"]: + """ + Partition the DataFrame by the given columns. 
+ + Args: + by: Columns to partition by. + more_by: Additional columns to partition by. + maintain_order: Whether to maintain the order of the partitions. + include_key: Whether to include the partition key. + as_dict: Whether to return the partitions as a dictionary. + + Returns: + List of DataFrames or dictionary of DataFrames. + """ + if isinstance(by, str): + by = [by, *more_by] + elif isinstance(by, list): + by = [*by, *more_by] + if as_dict: + return { + k: self.__constructor__(v) + for k, v in self.to_pandas() + .groupby(by, as_index=not include_key) + .groups + } + else: + return [ + self.__constructor__(g) + for g in self.to_pandas().groupby(by, as_index=not include_key) + ] + + def pipe(self, function, *args, **kwargs) -> Any: + return function(self, *args, **kwargs) + + def pivot( + self, + *, + values, + index, + columns, + aggregate_function=None, + maintain_order: bool = True, + sort_columns: bool = False, + separator: str = "_", + ) -> "DataFrame": + """ + Pivot the DataFrame. + + Args: + values: Values to pivot. + index: Index columns. + columns: Columns to pivot. + aggregate_function: Function to aggregate the values. + maintain_order: Whether to maintain the order of the pivot. + sort_columns: Whether to sort the columns. + separator: Separator for the columns. + + Returns: + Pivoted DataFrame. + """ + # TODO: handle maintain_order, sort_columns, separator + return self.__constructor__( + _query_compiler=self._query_compiler.pivot( + values=values, + index=index, + columns=columns, + agg=aggregate_function, + ) + ) + + def rechunk(self) -> "DataFrame": + """ + Rechunk the DataFrame into the given number of partitions. + + Returns: + Rechunked DataFrame. + """ + return self.copy() + + def rename(self, mapping: dict[str, str] | callable) -> "DataFrame": + """ + Rename the columns of the DataFrame. + + Args: + mapping: Mapping of old names to new names. + + Returns: + DataFrame with the columns renamed. + """ + if callable(mapping): + mapping = {c: mapping(c) for c in self.columns} + # TODO: add a query compiler method for `rename` + new_columns = {c: mapping.get(c, c) for c in self.columns} + new_obj = self.copy() + new_obj.columns = new_columns + return new_obj + + def replace_column(self, index: int, column: "Series") -> "DataFrame": + """ + Replace the column at the given index with the new column. + + Args: + index: Index of the column to replace. + column: New column to replace with. + + Returns: + DataFrame with the column replaced. + """ + self._query_compiler = self._query_compiler.drop([self.columns[index]]).insert( + index, + column.name, + column._query_compiler, + ) + return self + + def reverse(self) -> "DataFrame": + """ + Reverse the DataFrame. + + Returns: + Reversed DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array( + slice(None, None, -1) + ) + ) + + def rolling(self, index_column, *, period, offset, closed, group_by, check_sorted): + raise NotImplementedError("not yet") + + def row( + self, index: int | None = None, *, by_predicate=None, named: bool = False + ) -> tuple[Any] | dict[str, Any]: + """ + Get the row at the given index. + + Args: + index: Index of the row to get. + by_predicate: Predicate to get the row by. + named: Whether to return the row as a dictionary. + + Returns: + Row at the given index. 
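+
+        Example (illustrative; assumes ``df`` has columns ``a`` and ``b``):
+            >>> df.row(0)              # a tuple such as (1, 10)
+            >>> df.row(0, named=True)  # a dict such as {"a": 1, "b": 10}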
+ """ + if index is not None: + if named: + return dict(self.to_pandas().iloc[index]) + else: + return tuple(self.to_pandas().iloc[index]) + else: + # TODO: support expressions + raise NotImplementedError("not yet") + + def rows(self, *, named: bool = False) -> list[tuple[Any]] | list[dict[str, Any]]: + raise NotImplementedError("not yet") + + def rows_by_key( + self, + key: Any, + *, + named: bool = False, + include_key: bool = False, + unique: bool = False, + ) -> dict[Any, Iterable[Any]]: + raise NotImplementedError("not yet") + + def select(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def select_seq(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def set_sorted( + self, column: str | Iterable[str], *more_columns: str, descending: bool = False + ) -> "DataFrame": + """ + Set the columns to be sorted. + + Args: + column: Column to sort by. + more_columns: Additional columns to sort by. + descending: Whether to sort in descending order. + + Returns: + DataFrame with the columns sorted. + """ + if len(more_columns) > 0: + if isinstance(column, Iterable): + column = [*column, *more_columns] + else: + column = [column, *more_columns] + if isinstance(column, str): + column = [column] + new_sorted_columns = [c in column for c in self.columns] + obj = self.copy() + obj._sorted_columns = new_sorted_columns + return obj + + def sql(self, query: str, *, table_name: str = "self") -> "DataFrame": + raise NotImplementedError("not yet") + + def to_series(self, index: int = 0) -> "Series": + """ + Convert the DataFrame at index provided to a Series. + + Args: + index: Index of the column to convert to a Series. + + Returns: + Series representation of the DataFrame at index provided. + """ + return self[self.columns[index]] + + def transpose( + self, + *, + include_header: bool = False, + header_name: str = "column", + column_names: str | Sequence[str] | None = None, + ) -> "DataFrame": + """ + Transpose the DataFrame. + + Args: + include_header: Whether to include a header. + header_name: Name of the header. + column_names: Names of the columns. + + Returns: + Transposed DataFrame. + """ + result = self.__constructor__(_query_compiler=self._query_compiler.transpose()) + if column_names is not None: + result.columns = column_names + elif include_header: + result.columns = [f"{header_name}_{i}" for i in range(result.width)] + return result + + def unnest(self, columns, *more_columns) -> "DataFrame": + """ + Unnest the given columns. + + Args: + columns: Columns to unnest. + more_columns: Additional columns to unnest. + + Returns: + DataFrame with the columns unnested. + """ + raise NotImplementedError("not yet") + + def unstack( + self, + step: int, + how: str = "vertical", + columns=None, + fill_values: list[Any] | None = None, + ): + """ + Unstack the DataFrame. + + Args: + step: Step to unstack by. + how: How to unstack the DataFrame. + columns: Columns to unstack. + fill_values: Values to fill the unstacked DataFrame with. + + Returns: + Unstacked DataFrame. + """ + raise NotImplementedError("not yet") + + def update( + self, + other: "DataFrame", + on: str | Sequence[str] | None = None, + how: Literal["left", "inner", "full"] = "left", + *, + left_on: str | Sequence[str] | None = None, + right_on: str | Sequence[str] | None = None, + include_nulls: bool = False, + ) -> "DataFrame": + """ + Update the DataFrame with another DataFrame. 
+ + Args: + other: DataFrame to update with. + on: Column to update on. + how: How to update the DataFrame. + + Returns: + Updated DataFrame. + """ + raise NotImplementedError("not yet") + + def upsample( + self, + time_column: str, + *, + every: str, + offset: str | None = None, + group_by: str | Sequence[str] | None = None, + maintain_order: bool = False, + ) -> "DataFrame": + raise NotImplementedError("not yet") + + def vstack(self, other: "DataFrame", *, in_place: bool = False) -> "DataFrame": + """ + Stack the given DataFrame vertically. + + Args: + other: DataFrame to stack. + in_place: Whether to stack the DataFrames in place. + + Returns: + Stacked DataFrame. + """ + if in_place: + self._query_compiler = self._query_compiler.concat( + axis=0, other=other._query_compiler + ) + return self + else: + return self.__constructor__( + _query_compiler=self._query_compiler.concat( + axis=0, other=other._query_compiler + ) + ) + + def with_columns(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def with_columns_seq(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def with_row_index(self, name: str = "index", offset: int = 0) -> "DataFrame": + """ + Add a row index to the DataFrame. + + Args: + name: Name of the row index. + offset: Offset for the row index. + + Returns: + DataFrame with the row index added. + """ + if offset != 0: + obj = self.copy() + obj.index = obj.index + offset + result = self.__constructor__( + _query_compiler=self._query_compiler.reset_index(drop=False) + ) + result.columns = [name, *self.columns] + return result + + with_row_count = with_row_index + + def map_rows( + self, function: callable, return_dtype=None, *, inference_size: int = 256 + ) -> "DataFrame": + """ + Apply the given function to the DataFrame. + + Args: + function: Function to apply. + return_dtype: Return type of the function. + inference_size: Size of the inference. + + Returns: + DataFrame with the function applied. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.apply(function, axis=1) + ) + + def corr(self, **kwargs: Any) -> "DataFrame": + """ + Compute the correlation of the DataFrame. + + Returns: + DataFrame with the correlation. + """ + return self.__constructor__(_query_compiler=self._query_compiler.corr(**kwargs)) + + def lazy(self) -> "LazyFrame": + """ + Convert the DataFrame to a lazy DataFrame. + + Returns: + Lazy DataFrame. + """ + raise NotImplementedError("not yet") + + @classmethod + def deserialize(cls, source) -> "DataFrame": + """ + Deserialize the DataFrame. + + Args: + source: Source to deserialize. + + Returns: + Deserialized DataFrame. + """ + return cls(polars.DataFrame.deserialize(source)) + + def serialize(self, file=None) -> str | None: + """ + Serialize the DataFrame. + + Args: + file: File to serialize to. + + Returns: + Serialized DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).serialize(file) + + @property + def style(self): + """ + Create a Great Table for styling. + + Returns: + GreatTable object. + """ + return self._to_polars().style + + def to_dict( + self, *, as_series: bool = True + ) -> dict[str, "Series"] | dict[str, list[Any]]: + """ + Convert the DataFrame to a dictionary representation. + + Args: + as_series: Whether to convert the columns to Series. + + Returns: + Dictionary representation of the DataFrame. 
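+
+        Example (a minimal sketch with a hypothetical column ``a``):
+            >>> from modin.polars import DataFrame
+            >>> df = DataFrame({"a": [1, 2]})
+            >>> df.to_dict(as_series=False)  # {"a": [1, 2]}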
+ """ + if as_series: + return {name: self[name] for name in self.columns} + else: + return polars.from_pandas(self._query_compiler.to_pandas()).to_dict( + as_series=as_series + ) + + def to_dicts(self) -> list[dict[str, Any]]: + """ + Convert the DataFrame to a list of dictionaries. + + Returns: + List of dictionaries. + """ + return self._to_polars().to_dicts() + + def to_init_repr(self, n: int = 1000) -> str: + """ + Get the string representation of the DataFrame for initialization. + + Returns: + String representation of the DataFrame for initialization. + """ + return self._to_polars().to_init_repr(n) + + def to_struct(self, name: str = "") -> "Series": + """ + Convert the DataFrame to a struct. + + Args: + name: Name of the struct. + + Returns: + Series representation of the DataFrame as a struct. + """ + raise NotImplementedError("not yet") + + def unpivot( + self, + on, + *, + index, + variable_name: str | None = None, + value_name: str | None = None, + ) -> "DataFrame": + """ + Unpivot a DataFrame from wide to long format. + + Args: + on: Columns to unpivot. + index: Columns to keep. + variable_name: Name of the variable column. + value_name: Name of the value column. + + Returns: + Unpivoted DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.melt( + on=on, + index=index, + var_name=variable_name, + value_name=value_name, + ) + ) + + write_avro = write_clipboard = write_csv = write_database = write_delta = ( + write_excel + ) = write_ipc = write_ipc_stream = write_json = write_ndjson = write_parquet = ( + write_parquet_partitioned + ) = lambda *args, **kwargs: (_ for _ in ()).throw(NotImplementedError("not yet")) + + def clear(self, n: int = 0) -> "DataFrame": + """ + Create an empty (n=0) or null filled (n>0) DataFrame. + + Args: + n: Number of rows to create. + + Returns: + Empty or null filled DataFrame. + """ + return self.__constructor__(polars.DataFrame(schema=self.schema).clear(n=n)) + + def collect_schema(self) -> dict[str, str]: + """ + Collect the schema of the DataFrame. + + Returns: + Dictionary of the schema. + """ + return self.schema + + def fold(self, operation: callable) -> "Series": + """ + Fold the DataFrame. + + Args: + operation: Operation to fold the DataFrame with. + + Returns: + Series with the folded DataFrame. + """ + raise NotImplementedError("not yet") + + def hash_rows( + self, + seed: int = 0, + seed_1: int | None = None, + seed_2: int | None = None, + seed_3: int | None = None, + ) -> "Series": + raise NotImplementedError("not yet") diff --git a/modin/polars/groupby.py b/modin/polars/groupby.py new file mode 100644 index 00000000000..ec6305a4b2b --- /dev/null +++ b/modin/polars/groupby.py @@ -0,0 +1,247 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+
+"""Implement GroupBy public API as polars does."""
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from modin.polars import DataFrame
+
+
+class GroupBy:
+
+    def __init__(
+        self,
+        df: "DataFrame",
+        *by,
+        maintain_order: bool = False,
+        **named_by,
+    ) -> None:
+        self.df = df
+        if len(by) == 1:
+            self.by = by[0]
+        else:
+            if all(isinstance(b, str) and b in self.df.columns for b in by):
+                self.by = self.df[list(by)]._query_compiler
+            elif all(isinstance(b, type(self.df._query_compiler)) for b in by):
+                self.by = by
+            else:
+                raise NotImplementedError("not yet")
+        self.named_by = named_by
+        self.maintain_order = maintain_order
+
+    def agg(self, *aggs, **named_aggs):
+        raise NotImplementedError("not yet")
+
+    def all(self):
+        raise NotImplementedError("not yet")
+
+    def map_groups(self, function) -> "DataFrame":
+        raise NotImplementedError("not yet")
+
+    apply = map_groups
+
+    def count(self):
+        return self.len(name="count")
+
+    def first(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_first(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def head(self, n: int = 5):
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_head(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(n=n),
+                drop=False,
+            )
+        )
+
+    def last(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_last(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def len(self, name: str | None = None) -> "DataFrame":
+        if name is None:
+            name = "len"
+        result = self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_size(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            )
+        )
+        result._query_compiler.columns = [
+            c if c != "size" else name for c in result.columns
+        ]
+        return result
+
+    def max(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_max(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            )
+        )
+
+    def mean(self) -> "DataFrame":
+        # TODO: Non numeric columns are dropped, but in Polars they are converted to null
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_mean(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(numeric_only=True),
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def median(self) -> "DataFrame":
+        # TODO: Non numeric columns are dropped, but in Polars they are converted to null
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_median(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(numeric_only=True),
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def min(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_min(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    
sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs={}, + drop=False, + ) + ) + + def n_unique(self) -> "DataFrame": + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_nunique( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs={}, + drop=False, + ) + ) + + def quantile(self, quantile: float, interpolation="nearest") -> "DataFrame": + # TODO: Non numeric columns are dropped, but in Polars they are converted to null + # TODO: interpolation types not yet supported + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_quantile( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=True, + ), + agg_args=(), + agg_kwargs=dict(numeric_only=True, q=quantile), + drop=False, + ).reset_index(drop=False) + ) + + def sum(self) -> "DataFrame": + # TODO: Non numeric columns are dropped, but in Polars they are converted to null + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_sum( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=True, + ), + agg_args=(), + agg_kwargs=dict(numeric_only=True), + drop=False, + ).reset_index(drop=False) + ) + + def tail(self, n: int = 5): + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_tail( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs=dict(n=n), + drop=False, + ) + ) diff --git a/modin/polars/lazyframe.py b/modin/polars/lazyframe.py new file mode 100644 index 00000000000..8616b6ae15c --- /dev/null +++ b/modin/polars/lazyframe.py @@ -0,0 +1,22 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from modin.polars.base import BasePolarsDataset + + +class LazyFrame(BasePolarsDataset): + """ + Stub for Lazy Frame implementation. + """ + + pass diff --git a/modin/polars/series.py b/modin/polars/series.py new file mode 100644 index 00000000000..8db757908c9 --- /dev/null +++ b/modin/polars/series.py @@ -0,0 +1,2159 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Module houses `Series` class, that is distributed version of `polars.Series`.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Sequence + +import numpy as np +import pandas +import polars +from polars._utils.various import no_default + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.error_message import ErrorMessage +from modin.pandas import Series as ModinPandasSeries +from modin.pandas.io import from_pandas +from modin.polars.base import BasePolarsDataset + +if TYPE_CHECKING: + from numpy.typing import ArrayLike + from polars import PolarsDataType + + from modin.polars import DataFrame + + +class Series(BasePolarsDataset): + def __init__( + self, + name: str | "ArrayLike" | None = None, + values: "ArrayLike" | None = None, + dtype: "PolarsDataType | None" = None, + *, + strict: "bool" = True, + nan_to_null: "bool" = False, + dtype_if_empty: "PolarsDataType" = polars.Null, + _query_compiler: BaseQueryCompiler | None = None, + ) -> None: + if _query_compiler is None: + if isinstance(values, ModinPandasSeries): + self._query_compiler = values._query_compiler.copy() + else: + self._query_compiler: BaseQueryCompiler = from_pandas( + polars.Series( + name=name, + values=values, + dtype=dtype, + strict=strict, + nan_to_null=nan_to_null, + dtype_if_empty=dtype_if_empty, + ) + .to_pandas() + .to_frame() + )._query_compiler + else: + self._query_compiler: BaseQueryCompiler = _query_compiler + + def __repr__(self): + return repr( + polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)) + ) + + _sorted = False + _descending = None + + def to_pandas(self) -> ModinPandasSeries: + return ModinPandasSeries(query_compiler=self._query_compiler) + + def arg_max(self) -> int: + """ + Get the index of the maximum value. + + Returns: + Index of the maximum value. + """ + return self.to_pandas().argmax() + + def arg_min(self) -> int: + """ + Get the index of the minimum value. + + Returns: + Index of the minimum value. + """ + return self.to_pandas().argmin() + + def implode(self) -> "Series": + """ + Aggregate values into a list. + + Returns: + Imploded Series. + """ + raise NotImplementedError("not yet") + + def max(self) -> Any: + """ + Get the maximum value. + + Returns: + Maximum value. + """ + return self.to_pandas().max() + + def min(self) -> Any: + """ + Get the minimum value. + + Returns: + Minimum value. + """ + return self.to_pandas().min() + + def mean(self) -> Any: + """ + Get the mean value. + + Returns: + Mean value. + """ + return self.to_pandas().mean() + + def median(self) -> Any: + """ + Get the median value. + + Returns: + Median value. + """ + return self.to_pandas().median() + + def mode(self) -> Any: + """ + Get the mode value. + + Returns: + Mode value. + """ + return self.to_pandas().mode() + + def nan_max(self) -> Any: + """ + Get the maximum value, ignoring NaN values. + + Returns: + Maximum value. + """ + return self.to_pandas().max(skipna=True) + + def nan_min(self) -> Any: + """ + Get the minimum value, ignoring NaN values. + + Returns: + Minimum value. 
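+
+        Example (illustrative; delegates to pandas ``min(skipna=True)``):
+
+            Series("a", [1.0, float("nan"), 3.0]).nan_min()
+            # -> 1.0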
+ """ + return self.to_pandas().min(skipna=True) + + def product(self) -> Any: + """ + Get the product of all values. + + Returns: + Product of all values. + """ + return self.to_pandas().product() + + def quantile(self, quantile: float, interpolation: str = "nearest") -> float | None: + """ + Get the quantile value. + + Args: + quantile: Quantile to calculate. + interpolation: Interpolation method. + + Returns: + Quantile value. + """ + return self.to_pandas().quantile(quantile, interpolation=interpolation) + + def std(self, ddof: int = 1) -> float: + """ + Get the standard deviation. + + Args: + ddof: Delta Degrees of Freedom. + + Returns: + Standard deviation. + """ + return self.to_pandas().std(ddof=ddof) + + def sum(self) -> Any: + """ + Get the sum of all values. + + Returns: + Sum of all values. + """ + return self.to_pandas().sum() + + def var(self, ddof: int = 1) -> float: + """ + Get the variance. + + Args: + ddof: Delta Degrees of Freedom. + + Returns: + Variance. + """ + return self.to_pandas().var(ddof=ddof) + + @property + def arr(self) -> polars.series.array.ArrayNameSpace: + """ + Get the underlying array. + + Returns: + Underlying array. + """ + return polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)).arr + + @property + def dtype(self) -> polars.datatypes.DataType: + """ + Get the data type. + + Returns: + Data type. + """ + return polars.from_pandas( + pandas.Series().astype(self._query_compiler.dtypes.iloc[0]) + ).dtype + + @property + def name(self) -> str: + """ + Get the name. + + Returns: + Name. + """ + return self._query_compiler.columns[0] + + @property + def shape(self) -> tuple[int]: + """ + Get the shape. + + Returns: + Shape. + """ + return (len(self._query_compiler.index),) + + flags = [] + + @property + def bin(self): + raise NotImplementedError("not yet") + + def all(self) -> bool: + """ + Check if all values are True. + + Returns: + True if all values are True, False otherwise. + """ + return self.to_pandas().all() + + def any(self) -> bool: + """ + Check if any value is True. + + Returns: + True if any value is True, False otherwise. + """ + return self.to_pandas().any() + + def not_(self) -> "Series": + """ + Negate the values. + + Returns: + Negated Series. + """ + return self.__constructor__(_query_compiler=self._query_compiler.invert()) + + @property + def cat(self): + raise NotImplementedError("not yet") + + def abs(self) -> "Series": + """ + Get the absolute values. + + Returns: + Absolute values Series. + """ + return self.__constructor__(_query_compiler=self._query_compiler.abs()) + + def arccos(self) -> "Series": + """ + Get the arc cosine values. + + Returns: + Arc cosine values Series. + """ + raise NotImplementedError("not yet") + + def arccosh(self) -> "Series": + """ + Get the hyperbolic arc cosine values. + + Returns: + Hyperbolic arc cosine values Series. + """ + raise NotImplementedError("not yet") + + def arcsin(self) -> "Series": + """ + Get the arc sine values. + + Returns: + Arc sine values Series. + """ + raise NotImplementedError("not yet") + + def arcsinh(self) -> "Series": + """ + Get the hyperbolic arc sine values. + + Returns: + Hyperbolic arc sine values Series. + """ + raise NotImplementedError("not yet") + + def arctan(self) -> "Series": + """ + Get the arc tangent values. + + Returns: + Arc tangent values Series. + """ + raise NotImplementedError("not yet") + + def arctanh(self) -> "Series": + """ + Get the hyperbolic arc tangent values. + + Returns: + Hyperbolic arc tangent values Series. 
+        """
+        raise NotImplementedError("not yet")
+
+    def arg_true(self) -> "Series":
+        """
+        Get the indices where the Series is True.
+
+        Returns:
+            Indices of the True values.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.reset_index(drop=False)
+            .getitem_array(self._query_compiler)
+            .getitem_column_array(0, numeric=True)
+        ).rename(self.name)
+
+    def arg_unique(self) -> "Series":
+        """
+        Get the index of the first unique value.
+
+        Returns:
+            Index of the first unique value.
+        """
+        raise NotImplementedError("not yet")
+
+    def cbrt(self) -> "Series":
+        """
+        Get the cube root values.
+
+        Returns:
+            Cube root values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cos(self) -> "Series":
+        """
+        Get the cosine values.
+
+        Returns:
+            Cosine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cosh(self) -> "Series":
+        """
+        Get the hyperbolic cosine values.
+
+        Returns:
+            Hyperbolic cosine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cot(self) -> "Series":
+        """
+        Get the cotangent values.
+
+        Returns:
+            Cotangent values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cum_count(self) -> "Series":
+        """
+        Get the cumulative count of the non-null values.
+
+        Returns:
+            Cumulative count values Series.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.notna().cumsum()
+        )
+
+    def cum_max(self) -> "Series":
+        """
+        Get the cumulative maximum values.
+
+        Returns:
+            Cumulative maximum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cummax())
+
+    def cum_min(self) -> "Series":
+        """
+        Get the cumulative minimum values.
+
+        Returns:
+            Cumulative minimum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cummin())
+
+    def cum_prod(self) -> "Series":
+        """
+        Get the cumulative product values.
+
+        Returns:
+            Cumulative product values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cumprod())
+
+    def cum_sum(self) -> "Series":
+        """
+        Get the cumulative sum values.
+
+        Returns:
+            Cumulative sum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cumsum())
+
+    def cumulative_eval(
+        self, expr, min_periods: int = 1, *, parallel: bool = False
+    ) -> "Series":
+        """
+        Get the cumulative evaluation values.
+
+        Args:
+            expr: Expression to evaluate.
+            min_periods: Minimum number of periods.
+
+        Returns:
+            Cumulative evaluation values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def diff(self, n: int = 1, null_behavior: str = "ignore") -> "Series":
+        """
+        Calculate the first discrete difference between shifted items.
+
+        Args:
+            n: Number of periods to shift.
+            null_behavior: Null behavior.
+
+        Returns:
+            Difference values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def dot(self, other) -> int | float | None:
+        """
+        Calculate the dot product.
+
+        Args:
+            other: Other Series.
+
+        Returns:
+            Dot product.
+        """
+        if isinstance(other, Series):
+            other = other.to_pandas()
+        return self.to_pandas().dot(other)
+
+    def entropy(
+        self, base: float = 2.718281828459045, *, normalize: bool = False
+    ) -> float:
+        """
+        Calculate the entropy.
+
+        Args:
+            base: Logarithm base.
+            normalize: Normalize the entropy.
+
+        Returns:
+            Entropy.
+ """ + raise NotImplementedError("not yet") + + def ewm_mean( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted mean. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted mean Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .mean() + ) + + def ewm_mean_by(self, by, *, half_life: int | None = None) -> "Series": + """ + Calculate the exponential weighted mean by group. + + Args: + by: Grouping Series. + + Returns: + Exponential weighted mean Series. + """ + raise NotImplementedError("not yet") + + def ewm_std( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted standard deviation. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted standard deviation Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .std() + ) + + def ewm_var( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted variance. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted variance Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .var() + ) + + def exp(self) -> "Series": + """ + Calculate the exponential values. + + Returns: + Exponential values Series. + """ + return self.__constructor__(self.to_pandas().exp()) + + def hash( + self, + seed: int = 0, + seed_1: int | None = None, + seed_2: int | None = None, + seed_3: int | None = None, + ) -> "Series": + """ + Calculate the hash values. + + Args: + seed: Seed. + seed_1: Seed 1. + seed_2: Seed 2. + seed_3: Seed 3. + + Returns: + Hash values Series. + """ + raise NotImplementedError("not yet") + + def hist( + self, + bins: list[float] | None = None, + *, + bin_count: int | None = None, + include_category: bool = True, + include_breakpoint: bool = True, + ) -> "Series": + """ + Calculate the histogram. + + Args: + bins: Bins. + bin_count: Bin count. + + Returns: + Histogram Series. + """ + raise NotImplementedError("not yet") + + def is_between(self, lower_bound, upper_bound, closed: str = "both") -> "Series": + """ + Check if values are between the bounds. + + Args: + lower_bound: Lower bound. + upper_bound: Upper bound. + closed: Closed bounds. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def kurtosis(self, *, fisher: bool = True, bias: bool = True) -> float | None: + """ + Calculate the kurtosis. + + Args: + fisher: Fisher method. + bias: Bias method. + + Returns: + Kurtosis. 
+ """ + return self.to_pandas().kurtosis(fisher=fisher, bias=bias) + + def log(self, base: float = 2.718281828459045) -> "Series": + """ + Calculate the logarithm values. + + Args: + base: Logarithm base. + + Returns: + Logarithm values Series. + """ + raise NotImplementedError("not yet") + + def log10(self) -> "Series": + """ + Calculate the base 10 logarithm values. + + Returns: + Base 10 logarithm values Series. + """ + return self.log(10) + + def log1p(self) -> "Series": + """ + Calculate the natural logarithm of 1 plus the values. + + Returns: + Natural logarithm of 1 plus the values Series. + """ + raise NotImplementedError("not yet") + + def replace( + self, + mapping: dict[Any, Any], + *, + default: Any = None, + return_dtype=None, + ) -> "Series": + """ + Map values to other values. + + Args: + mapping: Mapping. + + Returns: + Mapped Series. + """ + return self.__constructor__( + self.to_pandas().apply(lambda x: mapping.get(x, default)) + ) + + def pct_change(self, n: int = 1) -> "Series": + """ + Calculate the percentage change. + + Args: + n: Number of periods to shift. + + Returns: + Percentage change Series. + """ + return self.__constructor__(self.to_pandas().pct_change(n)) + + def peak_max(self) -> "Series": + """ + Get the peak maximum values. + + Returns: + Peak maximum values Series. + """ + return self.__eq__(self.max()) + + def peak_min(self) -> "Series": + """ + Get the peak minimum values. + + Returns: + Peak minimum values Series. + """ + return self.__eq__(self.min()) + + def rank( + self, + method: str = "average", + *, + descending: bool = False, + seed: int | None = None, + ) -> "Series": + """ + Calculate the rank. + + Args: + method: Rank method. + + Returns: + Rank Series. + """ + # TODO: support seed + if method not in ["average", "min", "max", "first", "dense"]: + raise ValueError(f"method {method} not supported") + return self.__constructor__( + self.to_pandas().rank(method=method, ascending=not descending) + ) + + def rolling_map( + self, + function: callable, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .apply(function) + ) + + def rolling_max( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling maximum function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .max() + ) + + def rolling_mean( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling mean function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. 
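+
+        Example (illustrative; ``min_periods`` defaults to 1 here, so the
+        first window is partial rather than null):
+
+            Series("a", [1.0, 2.0, 3.0, 4.0]).rolling_mean(window_size=2)
+            # -> values [1.0, 1.5, 2.5, 3.5]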
+ """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .mean() + ) + + def rolling_median( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling median function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .median() + ) + + def rolling_min( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling minimum function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .min() + ) + + def rolling_quantile( + self, + window_size: int, + quantile: float, + interpolation: str = "nearest", + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling quantile function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .quantile(quantile, interpolation=interpolation) + ) + + def rolling_skew(self, window_size: int, *, bias: bool = False) -> "Series": + """ + Apply a rolling skewness function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + return self.__constructor__(self.to_pandas().rolling(window=window_size).skew()) + + def rolling_std( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ddof: int = 1, + ) -> "Series": + """ + Apply a rolling standard deviation function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .std(ddof=ddof) + ) + + def rolling_sum( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ) -> "Series": + """ + Apply a rolling sum function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. + """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .sum() + ) + + def rolling_var( + self, + window_size: int, + weights: list[float] | None = None, + min_periods: int = 1, + *, + center: bool = False, + ddof: int = 1, + ) -> "Series": + """ + Apply a rolling variance function. + + Args: + function: Function to apply. + window_size: Window size. + + Returns: + Applied Series. 
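+
+        Example (illustrative; with ``ddof=1`` a single-element window
+        yields a missing value):
+
+            Series("a", [1.0, 2.0, 4.0]).rolling_var(window_size=2)
+            # -> values [NaN, 0.5, 2.0]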
+ """ + if weights is not None: + raise NotImplementedError("not yet") + return self.__constructor__( + self.to_pandas() + .rolling(window=window_size, min_periods=min_periods, center=center) + .var(ddof=ddof) + ) + + def search_sorted(self, element, side: str = "any") -> int | "Series": + """ + Search for the element in the sorted Series. + + Args: + element: Element to search. + side: Side to search. + + Returns: + Index of the element. + """ + if side == "any": + side = "left" + return self.__constructor__(self.to_pandas().searchsorted(element, side=side)) + + def sign(self) -> "Series": + """ + Get the sign values. + + Returns: + Sign values Series. + """ + return self.__lt__(0).__mul__(-1).__add__(self.__gt__(0)) + + def sin(self) -> "Series": + """ + Get the sine values. + + Returns: + Sine values Series. + """ + raise NotImplementedError("not yet") + + def sinh(self) -> "Series": + """ + Get the hyperbolic sine values. + + Returns: + Hyperbolic sine values Series. + """ + raise NotImplementedError("not yet") + + def skew(self, *, bias: bool = True) -> float: + """ + Calculate the skewness. + + Args: + bias: Bias method. + + Returns: + Skewness. + """ + return self.to_pandas().skew() + + def sqrt(self) -> "Series": + """ + Get the square root values. + + Returns: + Square root values Series. + """ + return self.__constructor__(self.to_pandas().sqrt()) + + def tan(self) -> "Series": + """ + Get the tangent values. + + Returns: + Tangent values Series. + """ + raise NotImplementedError("not yet") + + def tanh(self) -> "Series": + """ + Get the hyperbolic tangent values. + + Returns: + Hyperbolic tangent values Series. + """ + raise NotImplementedError("not yet") + + def chunk_lengths(self) -> list[int]: + """ + Get the chunk lengths. + + Returns: + Chunk lengths. + """ + raise NotImplementedError("not yet") + + def describe( + self, + percentiles: Sequence[float] | float | None = (0.25, 0.5, 0.75), + interpolation: str = "nearest", + ): + """ + Generate descriptive statistics. + + Args: + percentiles: Percentiles to calculate. + + Returns: + Descriptive statistics. + """ + return self.to_pandas().describe(percentiles=percentiles) + + def estimated_size(self) -> int: + """ + Get the estimated size. + + Returns: + Estimated size. + """ + return self.to_pandas().memory_usage(index=False) + + def has_nulls(self) -> bool: + """ + Check if there are null values. + + Returns: + True if there are null values, False otherwise. + """ + return self.to_pandas().isnull().any() + + has_validity = has_nulls + + def is_finite(self) -> "Series": + """ + Check if the values are finite. + + Returns: + True if the values are finite, False otherwise. + """ + return self.__ne__(np.inf) + + def is_first_distinct(self) -> "Series": + """ + Check if the values are the first occurrence. + + Returns: + True if the values are the first occurrence, False otherwise. + """ + raise NotImplementedError("not yet") + + def is_in(self, other: "Series" | list[Any]) -> "Series": + """ + Check if the values are in the other Series. + + Args: + other: Other Series. + + Returns: + True if the values are in the other Series, False otherwise. + """ + return self.__constructor__(self.to_pandas().isin(other)) + + def is_infinite(self) -> "Series": + """ + Check if the values are infinite. + + Returns: + True if the values are infinite, False otherwise. + """ + return self.__eq__(np.inf) + + def is_last_distinct(self) -> "Series": + """ + Check if the values are the last occurrence. 
+ + Returns: + True if the values are the last occurrence, False otherwise. + """ + raise NotImplementedError("not yet") + + def is_nan(self) -> "Series": + """ + Check if the values are NaN. + + Returns: + True if the values are NaN, False otherwise. + """ + return self.__constructor__(_query_compiler=self._query_compiler.isna()) + + def is_not_nan(self) -> "Series": + """ + Check if the values are not NaN. + + Returns: + True if the values are not NaN, False otherwise. + """ + return self.__constructor__(_query_compiler=self._query_compiler.notna()) + + def is_not_null(self) -> "Series": + """ + Check if the values are not null. + + Returns: + True if the values are not null, False otherwise. + """ + return self.is_not_nan() + + def is_null(self) -> "Series": + """ + Check if the values are null. + + Returns: + True if the values are null, False otherwise. + """ + return self.is_nan() + + def is_sorted( + self, + *, + descending: bool = False, + nulls_last: bool = False, + ) -> bool: + """ + Check if the values are sorted. + + Args: + descending: Descending order. + + Returns: + True if the values are sorted, False otherwise. + """ + return ( + self.to_pandas().is_monotonic_increasing + if not descending + else self.to_pandas().is_monotonic_decreasing + ) + + def len(self) -> int: + """ + Get the length of the values. + + Returns: + Length of the values Series. + """ + return len(self.to_pandas()) + + def lower_bound(self) -> "Series": + """ + Get the lower bound values. + + Returns: + Lower bound values Series. + """ + raise NotImplementedError("not yet") + + def null_count(self) -> int: + """ + Get the number of null values. + + Returns: + Number of null values. + """ + return self.to_pandas().isnull().sum() + + def unique_counts(self) -> "Series": + """ + Get the unique counts. + + Returns: + Unique counts. + """ + return self.__constructor__(values=self.to_pandas().value_counts()) + + def upper_bound(self) -> "Series": + """ + Get the upper bound values. + + Returns: + Upper bound values Series. + """ + raise NotImplementedError("not yet") + + def value_counts( + self, *, sort: bool = False, parallel: bool = False, name: str = "count" + ) -> "DataFrame": + """ + Get the value counts. + + Returns: + Value counts. + """ + from modin.polars import DataFrame + + return DataFrame( + self.to_pandas().value_counts(sort=sort).reset_index(drop=False, names=name) + ) + + def to_frame(self, name: str | None = None) -> "DataFrame": + """ + Convert the Series to a DataFrame. + + Args: + name: Name of the Series. + + Returns: + DataFrame representation of the Series. + """ + from modin.polars import DataFrame + + return DataFrame(_query_compiler=self._query_compiler).rename({self.name: name}) + + def to_init_repr(self, n: int = 1000) -> str: + """ + Convert Series to instantiatable string representation. + + Args: + n: First n elements. + + Returns: + Instantiatable string representation. + """ + return polars.from_pandas( + self.slice(0, n)._query_compiler.to_pandas() + ).to_init_repr() + + @property + def list(self): + # TODO: implement list object + # https://docs.pola.rs/api/python/stable/reference/series/list.html + raise NotImplementedError("not yet") + + def alias(self, name: str) -> "Series": + """ + Rename the Series. + + Args: + name: New name. + + Returns: + Renamed Series. + """ + return self.to_frame(name).to_series() + + def append(self, other: "Series") -> "Series": + """ + Append another Series. + + Args: + other: Other Series. + + Returns: + Appended Series. 
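+
+        Example (illustrative):
+
+            Series("a", [1, 2]).append(Series("a", [3, 4])).len()
+            # -> 4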
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.concat(0, other._query_compiler)
+        )
+
+    def arg_sort(
+        self, *, descending: bool = False, nulls_last: bool = False
+    ) -> "Series":
+        """
+        Get the sorted indices.
+
+        Args:
+            descending: Descending order.
+
+        Returns:
+            Sorted indices Series.
+        """
+        # TODO: implement nulls_last
+        result = self.__constructor__(values=self.to_pandas().argsort())
+        if descending:
+            return result.reverse()
+        else:
+            return result
+
+    def ceil(self) -> "Series":
+        """
+        Get the ceiling values.
+
+        Returns:
+            Ceiling values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def clear(self, n: int = 0) -> "Series":
+        """
+        Create an empty copy of the current Series, with zero to `n` elements.
+
+        Args:
+            n: Number of elements.
+
+        Returns:
+            Series with n nulls.
+        """
+        raise NotImplementedError("not yet")
+
+    def clip(self, lower_bound=None, upper_bound=None) -> "Series":
+        """
+        Clip the values.
+
+        Args:
+            lower_bound: Lower bound.
+            upper_bound: Upper bound.
+
+        Returns:
+            Clipped values Series.
+        """
+        return self.__constructor__(
+            values=self.to_pandas().clip(lower_bound, upper_bound)
+        )
+
+    def cut(
+        self,
+        breaks: Sequence[float],
+        *,
+        labels: list[str] | None = None,
+        break_point_label: str = "breakpoint",
+        left_closed: bool = False,
+        include_breaks: bool = False,
+        as_series: bool = True,
+    ) -> "BasePolarsDataset":
+        raise NotImplementedError("not yet")
+
+    def extend_constant(self, value) -> "Series":
+        """
+        Extend the Series with a constant value.
+
+        Args:
+            value: Constant value.
+
+        Returns:
+            Extended Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def floor(self) -> "BasePolarsDataset":
+        """
+        Get the floor values.
+
+        Returns:
+            Floor values Series.
+        """
+        return self.__floordiv__(1)
+
+    def gather(self, indices) -> "Series":
+        """
+        Gather values by indices.
+
+        Args:
+            indices: Indices.
+
+        Returns:
+            Gathered Series.
+        """
+        return self.__constructor__(
+            values=self.to_pandas().iloc[
+                (
+                    indices._query_compiler
+                    if hasattr(indices, "_query_compiler")
+                    else indices
+                )
+            ]
+        )
+
+    def interpolate_by(self, by) -> "Series":
+        """
+        Interpolate values by group.
+
+        Args:
+            by: Grouping Series.
+
+        Returns:
+            Interpolated Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def item(self, index: int | None = None) -> Any:
+        """
+        Get the item at the index.
+
+        Args:
+            index: Index.
+
+        Returns:
+            Item at the index.
+        """
+        if index is None:
+            # TODO: polars requires a single-element Series when no index is given
+            index = 0
+        return self.to_pandas().iloc[index]
+
+    def new_from_index(self, index: int, length: int) -> "Series":
+        """
+        Create a new Series from the index.
+
+        Args:
+            index: Index.
+            length: Length.
+
+        Returns:
+            New Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def qcut(
+        self,
+        quantiles: Sequence[float] | int,
+        *,
+        labels: Sequence[str] | None = None,
+        left_closed: bool = False,
+        allow_duplicates: bool = False,
+        include_breaks: bool = False,
+        break_point_label: str = "breakpoint",
+        category_labels: str = "category",
+        as_series: bool = True,
+    ) -> "Series" | "DataFrame":
+        """
+        Bin continuous values into discrete categories based on quantiles.
+
+        Args:
+            quantiles: Number of quantiles or sequence of quantiles.
+            labels: Labels for the resulting bins.
+            left_closed: Whether the intervals are left-closed.
+            allow_duplicates: Whether to allow duplicate intervals.
+            include_breaks: Whether to include the breaks in the result.
+            break_point_label: Label for the break points.
+            category_labels: Label for the categories.
+            as_series: Whether to return a Series.
+
+        Returns:
+            Binned Series.
+ """ + raise NotImplementedError("not yet") + + def rechunk(self, *, in_place: bool = False) -> "Series": + """ + Rechunk the Series. + + Args: + in_place: In-place operation. + + Returns: + Rechunked Series. + """ + raise NotImplementedError("not yet") + + rename = alias + + def reshape(self, dimensions, nested_type) -> "Series": + """ + Reshape the Series. + + Args: + dimensions: Dimensions. + nested_type: Nested type. + + Returns: + Reshaped Series. + """ + raise NotImplementedError("not yet") + + def reverse(self) -> "Series": + """ + Reverse the Series. + + Returns: + Reversed Series. + """ + return self.__constructor__(values=self.to_pandas().iloc[::-1]) + + def rle(self) -> "Series": + """ + Run-length encode the Series. + + Returns: + Run-length encoded Series. + """ + raise NotImplementedError("not yet") + + def rle_id(self) -> "Series": + """ + Run-length encode the Series with IDs. + + Returns: + Run-length encoded Series with IDs. + """ + raise NotImplementedError("not yet") + + def round(self, decimals: int = 0) -> "Series": + """ + Round the values. + + Args: + decimals: Number of decimals. + + Returns: + Rounded values Series. + """ + return self.__constructor__(values=self.to_pandas().round(decimals)) + + def round_sig_figs(self, digits: int) -> "Series": + """ + Round the values to significant figures. + + Args: + digits: Number of significant figures. + + Returns: + Rounded values Series. + """ + raise NotImplementedError("not yet") + + def scatter(self, indices, values) -> "Series": + """ + Scatter values by indices. + + Args: + indices: Indices. + values: Values. + + Returns: + Scattered Series. + """ + raise NotImplementedError("not yet") + + def set(self, filter: "Series", value: int | float | str | bool | None) -> "Series": + """ + Set values by filter. + + Args: + filter: Filter. + value: Value. + + Returns: + Set Series. + """ + raise NotImplementedError("not yet") + + def shrink_dtype(self) -> "Series": + """ + Shrink the data type. + + Returns: + Shrunk Series. + """ + raise NotImplementedError("not yet") + + def shuffle(self, seed: int | None = None) -> "Series": + """ + Shuffle the Series. + + Args: + seed: Seed. + + Returns: + Shuffled Series. + """ + raise NotImplementedError("not yet") + + def zip_with(self, mask: "Series", other: "Series") -> "Series": + """ + Zip the Series with another Series. + + Args: + mask: Mask Series. + other: Other Series. + + Returns: + Zipped Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.where( + mask._query_compiler, other._query_compiler + ) + ) + + def map_elements( + self, + function: callable, + return_dtype=None, + *, + skip_nulls: bool = True, + ) -> "Series": + """ + Map the elements. + + Args: + function: Function to apply. + + Returns: + Mapped Series. + """ + if return_dtype is not None or skip_nulls is False: + ErrorMessage.warn( + "`return_dtype` and `skip_nulls=False` are not supported yet" + ) + return self.__constructor__(values=self.to_pandas().apply(function)) + + def reinterpret(self, *, signed: bool = True) -> "Series": + """ + Reinterpret the data type of the series as signed or unsigned. + + Args: + signed: If True, reinterpret as signed, otherwise as unsigned. + + Returns: + Reinterpreted Series. + """ + raise NotImplementedError("not yet") + + def set_sorted(self, *, descending: bool = False) -> "Series": + """ + Set the Series as sorted. + + Args: + descending: Descending order. + + Returns: + Sorted Series. 
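+
+        Example (illustrative; only the sortedness flags are set, the data
+        itself is not rearranged):
+
+            s = Series("a", [1, 2, 3]).set_sorted(descending=False)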
+ """ + self._sorted = True + self._descending = descending + return self + + def to_physical(self) -> "Series": + """ + Convert the Series to physical. + + Returns: + Physical Series. + """ + raise NotImplementedError("not yet") + + def get_chunks(self) -> list["Series"]: + """ + Get the chunks. + + Returns: + Chunks. + """ + raise NotImplementedError("not yet") + + @property + def str(self): + # TODO: implement str object + # https://docs.pola.rs/api/python/stable/reference/series/string.html + raise NotImplementedError("not yet") + + @property + def struct(self): + # TODO: implement struct object + # https://docs.pola.rs/api/python/stable/reference/series/struct.html + raise NotImplementedError("not yet") + + @property + def dt(self): + # TODO: implement dt object + # https://docs.pola.rs/api/python/stable/reference/series/temporal.html + raise NotImplementedError("not yet") + + def __len__(self) -> int: + """ + Get the length of the Series. + """ + return self.len() + + def __matmul__(self, other) -> "Series": + """ + Matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __radd__(self, other) -> "Series": + """ + Right addition. + + Args: + other: Other Series. + + Returns: + Added Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.radd(other, axis=0) + ) + + def __rand__(self, other) -> "Series": + """ + Right and. + + Args: + other: Other Series. + + Returns: + And Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rand__(other, axis=0) + ) + + def __rfloordiv__(self, other) -> "Series": + """ + Right floor division. + + Args: + other: Other Series. + + Returns: + Floored Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rfloordiv(other, axis=0) + ) + + def __rmatmul__(self, other) -> "Series": + """ + Right matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __rmod__(self, other) -> "Series": + """ + Right modulo. + + Args: + other: Other Series. + + Returns: + Modulo Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmod(other, axis=0) + ) + + def __rmul__(self, other) -> "Series": + """ + Right multiplication. + + Args: + other: Other Series. + + Returns: + Multiplied Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmul(other, axis=0) + ) + + def __ror__(self, other) -> "Series": + """ + Right or. + + Args: + other: Other Series. + + Returns: + Or Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__ror__(other, axis=0) + ) + + def __rpow__(self, other) -> "Series": + """ + Right power. + + Args: + other: Other Series. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rpow(other, axis=0) + ) + + def __rsub__(self, other) -> "Series": + """ + Right subtraction. + + Args: + other: Other Series. + + Returns: + Subtracted Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rsub(other, axis=0) + ) + + def __rtruediv__(self, other) -> "Series": + """ + Right true division. + + Args: + other: Other Series. + + Returns: + Divided Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rtruediv(other, axis=0) + ) + + def __rxor__(self, other) -> "Series": + """ + Right xor. 
+ + Args: + other: Other Series. + + Returns: + Xor Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rxor__(other, axis=0) + ) + + def eq(self, other) -> "Series": + """ + Check if the values are equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.eq(other._query_compiler) + ) + + def eq_missing(self, other) -> "Series": + """ + Check if the values are equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def ge(self, other) -> "Series": + """ + Check if the values are greater than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ge(other._query_compiler) + ) + + def gt(self, other) -> "Series": + """ + Check if the values are greater than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.gt(other._query_compiler) + ) + + def le(self, other) -> "Series": + """ + Check if the values are less than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.le(other._query_compiler) + ) + + def lt(self, other) -> "Series": + """ + Check if the values are less than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.lt(other._query_compiler) + ) + + def n_unique(self) -> int: + """ + Get the number of unique values. + + Returns: + Number of unique values. + """ + return self._query_compiler.nunique().to_pandas().squeeze(axis=None) + + def ne(self, other) -> "Series": + """ + Check if the values are not equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ne(other._query_compiler) + ) + + def ne_missing(self, other) -> "Series": + """ + Check if the values are not equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def pow(self, exponent) -> "Series": + """ + Raise the values to the power of the exponent. + + Args: + exponent: Exponent. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.pow(exponent, axis=0) + ) + + def replace_strict( + self, old, new=no_default, *, default=no_default, return_dtype=None + ) -> "Series": + """ + Replace values strictly. + + Args: + old: Old values. + new: New values. + default: Default value. + + Returns: + Replaced Series. + """ + raise NotImplementedError("not yet") + + def to_list(self) -> list: + """ + Convert the Series to a list. + + Returns: + List representation of the Series. + """ + return self._to_polars().tolist() + + def drop_nans(self) -> "Series": + """ + Drop NaN values. + + Returns: + Series without NaN values. 
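+
+        Example (illustrative):
+
+            Series("a", [1.0, float("nan"), 3.0]).drop_nans().len()
+            # -> 2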
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.dropna(how="any") + ) diff --git a/modin/tests/polars/test_dataframe.py b/modin/tests/polars/test_dataframe.py new file mode 100644 index 00000000000..29936c0b0f7 --- /dev/null +++ b/modin/tests/polars/test_dataframe.py @@ -0,0 +1,25 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import polars +import polars.testing + +import modin.polars as pl + + +def test_init_roundtrip(): + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + df = pl.DataFrame(data) + polars_df = polars.DataFrame(data) + to_polars = polars.from_pandas(df._query_compiler.to_pandas()) + polars.testing.assert_frame_equal(polars_df, to_polars) From a40cef7f54c54571008346a7e7882add12ac9dc1 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Mon, 29 Jul 2024 09:11:16 -0700 Subject: [PATCH 03/20] FEAT-#7340: Add more granular lazy flags to query compiler (#7348) Signed-off-by: Jonathan Shi --- .../storage_formats/base/query_compiler.py | 45 ++++++++++--- .../storage_formats/pandas/query_compiler.py | 64 +++++++++++++++++-- modin/pandas/base.py | 17 +++-- modin/pandas/dataframe.py | 4 +- modin/pandas/general.py | 2 +- modin/pandas/groupby.py | 2 +- modin/tests/pandas/test_groupby.py | 4 +- 7 files changed, 111 insertions(+), 27 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 80e89a577a2..343008d2a3d 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -120,10 +120,21 @@ class BaseQueryCompiler( Attributes ---------- - lazy_execution : bool - Whether underlying execution engine is designed to be executed in a lazy mode only. - If True, such QueryCompiler will be handled differently at the front-end in order - to reduce execution triggering as much as possible. + lazy_row_labels : bool, default False + True if the backend defers computations of the row labels (`df.index` for a frame). + Used by the frontend to avoid unnecessary execution or defer error validation. + lazy_row_count : bool, default False + True if the backend defers computations of the number of rows (`len(df.index)`). + Used by the frontend to avoid unnecessary execution or defer error validation. + lazy_column_types : bool, default False + True if the backend defers computations of the column types (`df.dtypes`). + Used by the frontend to avoid unnecessary execution or defer error validation. + lazy_column_labels : bool, default False + True if the backend defers computations of the column labels (`df.columns`). + Used by the frontend to avoid unnecessary execution or defer error validation. 
+    lazy_column_count : bool, default False
+        True if the backend defers computations of the number of columns (`len(df.columns)`).
+        Used by the frontend to avoid unnecessary execution or defer error validation.
     _shape_hint : {"row", "column", None}, default: None
         Shape hint for frames known to be a column or a row, otherwise None.
 
@@ -197,7 +208,25 @@ def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
     # some of these abstract methods, but for the sake of generality they are
     # treated differently.
 
-    lazy_execution = False
+    lazy_row_labels = False
+    lazy_row_count = False
+    lazy_column_types = False
+    lazy_column_labels = False
+    lazy_column_count = False
+
+    @property
+    def lazy_shape(self):
+        """
+        Whether either of the underlying dataframe's dimensions (row count/column count) is computed lazily.
+
+        If True, the frontend should avoid length/shape checks as much as possible.
+
+        Returns
+        -------
+        bool
+        """
+        return self.lazy_row_count or self.lazy_column_count
+
     _shape_hint = None
 
     # Metadata modification abstract methods
@@ -4524,7 +4553,7 @@ def has_multiindex(self, axis=0):
     @property
     def frame_has_materialized_dtypes(self) -> bool:
         """
-        Check if the undelying dataframe has materialized dtypes.
+        Check if the underlying dataframe has materialized dtypes.
 
         Returns
         -------
@@ -4535,7 +4564,7 @@ def frame_has_materialized_dtypes(self) -> bool:
     @property
     def frame_has_materialized_columns(self) -> bool:
         """
-        Check if the undelying dataframe has materialized columns.
+        Check if the underlying dataframe has materialized columns.
 
         Returns
         -------
@@ -4546,7 +4575,7 @@ def frame_has_materialized_columns(self) -> bool:
     @property
     def frame_has_materialized_index(self) -> bool:
         """
-        Check if the undelying dataframe has materialized index.
+        Check if the underlying dataframe has materialized index.
 
         Returns
         -------
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 9d4467c2085..3581516a638 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -276,19 +276,69 @@ def __init__(self, modin_frame: PandasDataframe, shape_hint: Optional[str] = Non
         self._shape_hint = shape_hint
 
     @property
-    def lazy_execution(self):
+    def lazy_row_labels(self):
         """
-        Whether underlying Modin frame should be executed in a lazy mode.
+        Whether the row labels are computed lazily.
 
-        If True, such QueryCompiler will be handled differently at the front-end in order
-        to reduce triggering the computation as much as possible.
+        Equivalent to `not self.frame_has_materialized_index`.
 
         Returns
        -------
         bool
         """
-        frame = self._modin_frame
-        return not frame.has_materialized_index or not frame.has_materialized_columns
+        return not self.frame_has_materialized_index
+
+    @property
+    def lazy_row_count(self):
+        """
+        Whether the row count is computed lazily.
+
+        Equivalent to `not self.frame_has_materialized_index`.
+
+        Returns
+        -------
+        bool
+        """
+        return not self.frame_has_materialized_index
+
+    @property
+    def lazy_column_types(self):
+        """
+        Whether the dtypes are computed lazily.
+
+        Equivalent to `not self.frame_has_materialized_dtypes`.
+
+        Returns
+        -------
+        bool
+        """
+        return not self.frame_has_materialized_dtypes
+
+    @property
+    def lazy_column_labels(self):
+        """
+        Whether the column labels are computed lazily.
+
+        Equivalent to `not self.frame_has_materialized_columns`.
+
+        Returns
+        -------
+        bool
+        """
+        return not self.frame_has_materialized_columns
+
+    @property
+    def lazy_column_count(self):
+        """
+        Whether the column count is computed lazily.
+
+        Equivalent to `not self.frame_has_materialized_columns`.
+
+        Returns
+        -------
+        bool
+        """
+        return not self.frame_has_materialized_columns
 
     def finalize(self):
         self._modin_frame.finalize()
@@ -607,7 +657,7 @@ def reindex(self, axis, labels, **kwargs):
         return self.__constructor__(new_modin_frame)
 
     def reset_index(self, **kwargs) -> PandasQueryCompiler:
-        if self.lazy_execution:
+        if self.lazy_row_labels:
 
             def _reset(df, *axis_lengths, partition_idx):  # pragma: no cover
                 df = df.reset_index(**kwargs)
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index b930f1bc7c0..51c9cd8156f 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -1068,7 +1068,7 @@ def astype(
         # will handle errors where dtype dict includes keys that are not
         # in columns.
         if (
-            not self._query_compiler.lazy_execution
+            not self._query_compiler.lazy_column_labels
             and not set(dtype.keys()).issubset(set(self._query_compiler.columns))
             and errors == "raise"
         ):
@@ -1462,7 +1462,9 @@ def drop(
             axes[axis] = [axes[axis]]
         # In case of lazy execution we should bypass these error checking components
         # because they can force the materialization of the row or column labels.
-        if self._query_compiler.lazy_execution:
+        if (axis == "index" and self._query_compiler.lazy_row_labels) or (
+            axis == "columns" and self._query_compiler.lazy_column_labels
+        ):
             continue
         if errors == "raise":
             non_existent = pandas.Index(axes[axis]).difference(
@@ -2657,7 +2659,10 @@ def reset_index(
         # exist.
         if (
             not drop
-            and not self._query_compiler.lazy_execution
+            and not (
+                self._query_compiler.lazy_column_labels
+                or self._query_compiler.lazy_row_labels
+            )
             and not self._query_compiler.has_multiindex()
             and all(n in self.columns for n in ["level_0", "index"])
         ):
@@ -3944,7 +3949,7 @@ def __getitem__(self, key) -> Self:
         BasePandasDataset
             Located dataset.
         """
-        if not self._query_compiler.lazy_execution and len(self) == 0:
+        if not self._query_compiler.lazy_row_count and len(self) == 0:
             return self._default_to_pandas("__getitem__", key)
         # see if we can slice the rows
         # This lets us reuse code in pandas to error check
@@ -4075,7 +4080,7 @@ def _getitem_slice(self, key: slice) -> Self:
         if is_full_grab_slice(
             key,
             # Avoid triggering shape computation for lazy executions
-            sequence_len=(None if self._query_compiler.lazy_execution else len(self)),
+            sequence_len=(None if self._query_compiler.lazy_row_count else len(self)),
         ):
             return self.copy()
         return self.iloc[key]
@@ -4301,7 +4306,7 @@ def __getattribute__(self, item) -> Any:
         Any
         """
         attr = super().__getattribute__(item)
-        if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
+        if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape:
             # We default to pandas on empty DataFrames. This avoids a large amount of
             # pain in underlying implementation and returns a result immediately rather
             # than dealing with the edge cases that empty DataFrames have.
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 8a80809dd3e..3d97efb4af4 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -1084,7 +1084,7 @@ def insert(
                        + f"{len(value.columns)} columns instead."
) value = value.squeeze(axis=1) - if not self._query_compiler.lazy_execution and len(self.index) == 0: + if not self._query_compiler.lazy_row_count and len(self.index) == 0: if not hasattr(value, "index"): try: value = pandas.Series(value) @@ -2783,7 +2783,7 @@ def setitem_unhashable_key(df, value): if not isinstance(value, (Series, Categorical, np.ndarray, list, range)): value = list(value) - if not self._query_compiler.lazy_execution and len(self.index) == 0: + if not self._query_compiler.lazy_row_count and len(self.index) == 0: new_self = self.__constructor__({key: value}, columns=self.columns) self._update_inplace(new_self._query_compiler) else: diff --git a/modin/pandas/general.py b/modin/pandas/general.py index aeff9986f35..92aa195eae4 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -491,7 +491,7 @@ def concat( for obj in list_of_objs if ( isinstance(obj, (Series, pandas.Series)) - or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_execution) + or (isinstance(obj, DataFrame) and obj._query_compiler.lazy_shape) or sum(obj.shape) > 0 ) ] diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 882ae19f6d7..080424a1761 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -196,7 +196,7 @@ def __getattr__(self, key): def __getattribute__(self, item): attr = super().__getattribute__(item) - if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution: + if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_shape: # We default to pandas on empty DataFrames. This avoids a large amount of # pain in underlying implementation and returns a result immediately rather # than dealing with the edge cases that empty DataFrames have. diff --git a/modin/tests/pandas/test_groupby.py b/modin/tests/pandas/test_groupby.py index cf5d06e10e1..b82473c674b 100644 --- a/modin/tests/pandas/test_groupby.py +++ b/modin/tests/pandas/test_groupby.py @@ -2752,7 +2752,7 @@ def lazy_frame(self): donor_obj = pd.DataFrame()._query_compiler self._mock_obj = mock.patch( - f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_execution", + f"{donor_obj.__module__}.{donor_obj.__class__.__name__}.lazy_shape", new_callable=mock.PropertyMock, ) patch_obj = self._mock_obj.__enter__() @@ -2760,7 +2760,7 @@ def lazy_frame(self): df = pd.DataFrame(**self._df_kwargs) # The frame is lazy until `self.__exit__()` is called - assert df._query_compiler.lazy_execution + assert df._query_compiler.lazy_shape return df def __enter__(self): From 621f49e0875178d7f099d26431ff60bd5103e5f0 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 31 Jul 2024 06:14:04 -0500 Subject: [PATCH 04/20] FIX-#7351: Add ipython method calls to non-lookup list (#7352) Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 51c9cd8156f..04dd845915c 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -99,7 +99,13 @@ # Do not lookup certain attributes in columns or index, as they're used for some # special purposes, like serving remote context -_ATTRS_NO_LOOKUP = {"__name__", "_cache"} +_ATTRS_NO_LOOKUP = { + "__name__", + "_cache", + "_ipython_canary_method_should_not_exist_", + "_ipython_display_", + "_repr_mimebundle_", +} _DEFAULT_BEHAVIOUR = { "__init__", From 24018dbf9c3932e201c4b49869281eff7960bddf Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Wed, 31 Jul 2024 04:40:55 -0700 Subject: [PATCH 05/20] FIX-#7134: Use a 
separate docstring class for BasePandasDataset. (#7353) Signed-off-by: sfc-gh-mvashishtha --- modin/tests/config/docs_module/__init__.py | 4 ++-- modin/tests/config/docs_module/classes.py | 8 ++++++++ modin/tests/config/test_envvars.py | 8 ++++++++ modin/utils.py | 13 ++++++++++++- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/modin/tests/config/docs_module/__init__.py b/modin/tests/config/docs_module/__init__.py index aa21549f1bd..5f617d683ba 100644 --- a/modin/tests/config/docs_module/__init__.py +++ b/modin/tests/config/docs_module/__init__.py @@ -11,7 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -from .classes import DataFrame, Series +from .classes import BasePandasDataset, DataFrame, Series from .functions import read_csv -__all__ = ["DataFrame", "Series", "read_csv"] +__all__ = ["BasePandasDataset", "DataFrame", "Series", "read_csv"] diff --git a/modin/tests/config/docs_module/classes.py b/modin/tests/config/docs_module/classes.py index 8dc152e23cd..235c99bdf0f 100644 --- a/modin/tests/config/docs_module/classes.py +++ b/modin/tests/config/docs_module/classes.py @@ -22,3 +22,11 @@ class Series: def isna(self): """This is a test of the documentation module for Series.""" return + + +class BasePandasDataset: + """This is a test of the documentation module for BasePandasDataSet.""" + + def apply(): + """This is a test of the documentation module for BasePandasDataSet.apply.""" + return diff --git a/modin/tests/config/test_envvars.py b/modin/tests/config/test_envvars.py index 384bd5f199a..d057ecb0299 100644 --- a/modin/tests/config/test_envvars.py +++ b/modin/tests/config/test_envvars.py @@ -20,6 +20,7 @@ import modin.pandas as pd from modin.config.envvars import _check_vars from modin.config.pubsub import _UNSET, ExactStr +from modin.pandas.base import BasePandasDataset def reset_vars(*vars: tuple[cfg.Parameter]): @@ -89,6 +90,12 @@ def test_overrides(self): cfg.DocModule.put("modin.tests.config.docs_module") # Test for override + assert BasePandasDataset.__doc__ == ( + "This is a test of the documentation module for BasePandasDataSet." + ) + assert BasePandasDataset.apply.__doc__ == ( + "This is a test of the documentation module for BasePandasDataSet.apply." + ) assert ( pd.DataFrame.apply.__doc__ == "This is a test of the documentation module for DataFrame." @@ -96,6 +103,7 @@ def test_overrides(self): # Test for pandas doc when method is not defined on the plugin module assert pandas.DataFrame.isna.__doc__ in pd.DataFrame.isna.__doc__ assert pandas.DataFrame.isnull.__doc__ in pd.DataFrame.isnull.__doc__ + assert BasePandasDataset.astype.__doc__ in pd.DataFrame.astype.__doc__ # Test for override assert ( pd.Series.isna.__doc__ diff --git a/modin/utils.py b/modin/utils.py index 34071be132b..8623732671b 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -462,7 +462,18 @@ def _inherit_docstrings_in_place( if doc_module != DocModule.default and "pandas" in str( getattr(parent, "__module__", "") ): - parent = getattr(imported_doc_module, getattr(parent, "__name__", ""), parent) + parent_name = ( + # DocModule should use the class BasePandasDataset to override the + # docstrings of BasePandasDataset, even if BasePandasDataset + # normally inherits docstrings from a different `parent`. 
+ "BasePandasDataset" + if getattr(cls_or_func, "__name__", "") == "BasePandasDataset" + # For other classes, override docstrings with the class that has the + # same name as the `parent` class, e.g. DataFrame inherits + # docstrings from doc_module.DataFrame. + else getattr(parent, "__name__", "") + ) + parent = getattr(imported_doc_module, parent_name, parent) if parent != default_parent: # Reset API link in case the docs are overridden. apilink = None From 22ed4d87b7f7b6b9e068f65294aeb6311ec08bd2 Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Thu, 1 Aug 2024 07:06:55 -0700 Subject: [PATCH 06/20] FIX-#7113: Fix docstring overrides for subclasses. (#7354) Signed-off-by: sfc-gh-mvashishtha --- modin/tests/config/docs_module/classes.py | 3 +++ .../docs_module_with_just_base/__init__.py | 16 ++++++++++++++++ .../docs_module_with_just_base/classes.py | 17 +++++++++++++++++ modin/tests/config/test_envvars.py | 18 ++++++++++++++++++ modin/utils.py | 15 +++++++++++++-- 5 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 modin/tests/config/docs_module_with_just_base/__init__.py create mode 100644 modin/tests/config/docs_module_with_just_base/classes.py diff --git a/modin/tests/config/docs_module/classes.py b/modin/tests/config/docs_module/classes.py index 235c99bdf0f..9a8eabac61b 100644 --- a/modin/tests/config/docs_module/classes.py +++ b/modin/tests/config/docs_module/classes.py @@ -30,3 +30,6 @@ class BasePandasDataset: def apply(): """This is a test of the documentation module for BasePandasDataSet.apply.""" return + + def astype(): + """This is a test of the documentation module for BasePandasDataSet.astype.""" diff --git a/modin/tests/config/docs_module_with_just_base/__init__.py b/modin/tests/config/docs_module_with_just_base/__init__.py new file mode 100644 index 00000000000..f2da948e26c --- /dev/null +++ b/modin/tests/config/docs_module_with_just_base/__init__.py @@ -0,0 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from .classes import BasePandasDataset + +__all__ = ["BasePandasDataset"] diff --git a/modin/tests/config/docs_module_with_just_base/classes.py b/modin/tests/config/docs_module_with_just_base/classes.py new file mode 100644 index 00000000000..645c7c63df6 --- /dev/null +++ b/modin/tests/config/docs_module_with_just_base/classes.py @@ -0,0 +1,17 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +class BasePandasDataset: + def astype(): + """This is a test of the documentation module for BasePandasDataSet.astype.""" diff --git a/modin/tests/config/test_envvars.py b/modin/tests/config/test_envvars.py index d057ecb0299..17ac0b58786 100644 --- a/modin/tests/config/test_envvars.py +++ b/modin/tests/config/test_envvars.py @@ -12,6 +12,7 @@ # governing permissions and limitations under the License. import os +import sys import pandas import pytest @@ -96,6 +97,13 @@ def test_overrides(self): assert BasePandasDataset.apply.__doc__ == ( "This is a test of the documentation module for BasePandasDataSet.apply." ) + # Test scenario 2 from https://github.com/modin-project/modin/issues/7113: + # We can correctly override the docstring for BasePandasDataset.astype, + # which is the same method as Series.astype. + assert pd.Series.astype is BasePandasDataset.astype + assert BasePandasDataset.astype.__doc__ == ( + "This is a test of the documentation module for BasePandasDataSet.astype." + ) assert ( pd.DataFrame.apply.__doc__ == "This is a test of the documentation module for DataFrame." @@ -130,6 +138,16 @@ def test_not_redefining_classes_modin_issue_7138(self): assert pd.DataFrame is original_dataframe_class + def test_base_docstring_override_with_no_dataframe_or_series_class_issue_7113( + self, + ): + # This test case tests scenario 1 from issue 7113. + sys.path.append(f"{os.path.dirname(__file__)}") + cfg.DocModule.put("docs_module_with_just_base") + assert BasePandasDataset.astype.__doc__ == ( + "This is a test of the documentation module for BasePandasDataSet.astype." + ) + @pytest.mark.skipif(cfg.Engine.get() != "Ray", reason="Ray specific test") def test_ray_cluster_resources(): diff --git a/modin/utils.py b/modin/utils.py index 8623732671b..6c17b1b12d3 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -384,6 +384,13 @@ def _replace_doc( # inherited docstrings. _docstring_inheritance_calls: list[Callable[[str], None]] = [] +# This is a set of (class, attribute_name) pairs whose docstrings we have +# already replaced since we last updated DocModule. Note that we don't store +# the attributes themselves since we replace property attributes instead of +# modifying them in place: +# https://github.com/modin-project/modin/blob/e9dbcc127913db77473a83936e8b6bb94ef84f0d/modin/utils.py#L353 +_attributes_with_docstrings_replaced: set[tuple[type, str]] = set() + def _documentable_obj(obj: object) -> bool: """ @@ -417,6 +424,7 @@ def _update_inherited_docstrings(doc_module: DocModule) -> None: doc_module : DocModule The current DocModule. 
""" + _attributes_with_docstrings_replaced.clear() _doc_module = doc_module.get() for doc_inheritance_call in _docstring_inheritance_calls: doc_inheritance_call(doc_module=_doc_module) # type: ignore[call-arg] @@ -488,7 +496,8 @@ def _inherit_docstrings_in_place( if base is object: continue for attr, obj in base.__dict__.items(): - if attr in seen: + # only replace docstrings once to prevent https://github.com/modin-project/modin/issues/7113 + if attr in seen or (base, attr) in _attributes_with_docstrings_replaced: continue seen.add(attr) # Try to get the attribute from the docs class first, then @@ -507,10 +516,12 @@ def _inherit_docstrings_in_place( obj, overwrite_existing, apilink, - parent_cls=cls_or_func, + parent_cls=base, attr_name=attr, ) + _attributes_with_docstrings_replaced.add((base, attr)) + def _inherit_docstrings( parent: object, From 6dce30e5538b7de3ed291e0741f5a63b7481bd2d Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:33:11 +0200 Subject: [PATCH 07/20] FIX-#7355: Cpu count would be set incorrectly on a cluster (#7356) Signed-off-by: arunjose696 --- modin/config/envvars.py | 18 ++++++++++++++++++ modin/core/execution/dask/common/utils.py | 1 + modin/core/execution/ray/common/utils.py | 1 + modin/core/execution/unidist/common/utils.py | 1 + 4 files changed, 21 insertions(+) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 8654ebe30c1..3635c63d026 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -332,6 +332,24 @@ class CpuCount(EnvironmentVariable, type=int): varname = "MODIN_CPUS" + @classmethod + def _put(cls, value: int) -> None: + """ + Put specific value if CpuCount wasn't set by a user yet. + + Parameters + ---------- + value : int + Config value to set. + + Notes + ----- + This method is used to set CpuCount from cluster resources internally + and should not be called by a user. + """ + if cls.get_value_source() == ValueSource.DEFAULT: + cls.put(value) + @classmethod def _get_default(cls) -> int: """ diff --git a/modin/core/execution/dask/common/utils.py b/modin/core/execution/dask/common/utils.py index 067a94fcdf0..52b4e38f53d 100644 --- a/modin/core/execution/dask/common/utils.py +++ b/modin/core/execution/dask/common/utils.py @@ -74,3 +74,4 @@ def _disable_warnings(): num_cpus = len(client.ncores()) NPartitions._put(num_cpus) + CpuCount._put(num_cpus) diff --git a/modin/core/execution/ray/common/utils.py b/modin/core/execution/ray/common/utils.py index cc2010fc7fb..d419a61a0d2 100644 --- a/modin/core/execution/ray/common/utils.py +++ b/modin/core/execution/ray/common/utils.py @@ -151,6 +151,7 @@ def initialize_ray( num_cpus = int(ray.cluster_resources()["CPU"]) NPartitions._put(num_cpus) + CpuCount._put(num_cpus) # TODO(https://github.com/ray-project/ray/issues/28216): remove this # workaround once Ray gives a better way to suppress task errors. 
diff --git a/modin/core/execution/unidist/common/utils.py b/modin/core/execution/unidist/common/utils.py index 5aa31698b6a..6455d194b25 100644 --- a/modin/core/execution/unidist/common/utils.py +++ b/modin/core/execution/unidist/common/utils.py @@ -42,6 +42,7 @@ def initialize_unidist(): num_cpus = sum(v["CPU"] for v in unidist.cluster_resources().values()) modin_cfg.NPartitions._put(num_cpus) + modin_cfg.CpuCount._put(num_cpus) def deserialize(obj): # pragma: no cover From b236b76ece7bc917485bfc35aa2c89006213f1f1 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 2 Aug 2024 03:27:12 -0500 Subject: [PATCH 08/20] FIX-#7357: Fix `NoAttributeError` on `DataFrame.copy` (#7358) Signed-off-by: Devin Petersohn --- modin/polars/dataframe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modin/polars/dataframe.py b/modin/polars/dataframe.py index d4408ff39f0..021daea692a 100644 --- a/modin/polars/dataframe.py +++ b/modin/polars/dataframe.py @@ -104,6 +104,9 @@ def __getitem__(self, item): return Series(_query_compiler=self._query_compiler.getitem_array([item])) + def _copy(self): + return self.__constructor__(_query_compiler=self._query_compiler.copy()) + def _to_polars(self) -> polars.DataFrame: """ Convert the DataFrame to Polars format. @@ -266,7 +269,7 @@ def _convert_non_numeric_to_null(self): need_columns_reindex=False, ).astype({c: self._query_compiler.dtypes[c] for c in non_numeric_cols}) ) - return self.copy() + return self._copy() def mean(self, *, axis=None, null_strategy="ignore"): """ @@ -956,7 +959,7 @@ def rechunk(self) -> "DataFrame": Returns: Rechunked DataFrame. """ - return self.copy() + return self._copy() def rename(self, mapping: dict[str, str] | callable) -> "DataFrame": """ @@ -972,7 +975,7 @@ def rename(self, mapping: dict[str, str] | callable) -> "DataFrame": mapping = {c: mapping(c) for c in self.columns} # TODO: add a query compiler method for `rename` new_columns = {c: mapping.get(c, c) for c in self.columns} - new_obj = self.copy() + new_obj = self._copy() new_obj.columns = new_columns return new_obj @@ -1076,7 +1079,7 @@ def set_sorted( if isinstance(column, str): column = [column] new_sorted_columns = [c in column for c in self.columns] - obj = self.copy() + obj = self._copy() obj._sorted_columns = new_sorted_columns return obj @@ -1231,7 +1234,7 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> "DataFrame": DataFrame with the row index added. 
""" if offset != 0: - obj = self.copy() + obj = self._copy() obj.index = obj.index + offset result = self.__constructor__( _query_compiler=self._query_compiler.reset_index(drop=False) From 05e5c484d476c920bf5ca03633b0fb0bcf2cd0e7 Mon Sep 17 00:00:00 2001 From: Kirill Suvorov Date: Mon, 19 Aug 2024 13:56:21 +0200 Subject: [PATCH 09/20] FEAT-#7368: Add a new environment variable for using dynamic partitioning (#7369) Signed-off-by: Kirill Suvorov Co-authored-by: Anatoly Myachev --- docs/usage_guide/optimization_notes/index.rst | 33 +++++++++++++++++++ modin/config/__init__.py | 2 ++ modin/config/envvars.py | 12 +++++++ .../pandas/partitioning/partition_manager.py | 3 +- .../storage_formats/pandas/test_internals.py | 7 ++-- 5 files changed, 53 insertions(+), 4 deletions(-) diff --git a/docs/usage_guide/optimization_notes/index.rst b/docs/usage_guide/optimization_notes/index.rst index aadd813e318..0dcbe5a25d7 100644 --- a/docs/usage_guide/optimization_notes/index.rst +++ b/docs/usage_guide/optimization_notes/index.rst @@ -37,6 +37,38 @@ Range-partitioning is not a silver bullet, meaning that enabling it is not alway a link to the list of operations that have support for range-partitioning and practical advices on when one should enable it: :doc:`operations that support range-partitioning `. +Dynamic-partitioning in Modin +""""""""""""""""""""""""""""" + +Ray engine experiences slowdowns when running a large number of small remote tasks at the same time. Ray Core recommends to `avoid tiny task`_. +When modin DataFrame has a large number of partitions, some functions produce a large number of remote tasks, which can cause slowdowns. +To solve this problem, Modin suggests using dynamic partitioning. This approach reduces the number of remote tasks +by combining multiple partitions into a single virtual partition and perform a common remote task on them. + +Dynamic partitioning is typically used for operations that are fully or partially executed on all partitions separately. + +.. code-block:: python + + import modin.pandas as pd + from modin.config import context + + df = pd.DataFrame(...) + + with context(DynamicPartitioning=True): + df.abs() + +Dynamic partitioning is also not always useful, and this approach is usually used for medium-sized DataFrames with a large number of columns. +If the number of columns is small, the number of partitions will be close to the number of CPUs, and Ray will not have this problem. +If the DataFrame has too many rows, this is also not a good case for using Dynamic-partitioning, since each task is no longer tiny and performing +the combined tasks carries more overhead than assigning them separately. + +Unfortunately, the use of Dynamic-partitioning depends on various factors such as data size, number of CPUs, operations performed, +and it is up to the user to determine whether Dynamic-partitioning will give a boost in his case or not. + +.. + TODO: Define heuristics to automatically enable dynamic partitioning without performance penalty. + `Issue #7370 `_ + Understanding Modin's partitioning mechanism """""""""""""""""""""""""""""""""""""""""""" @@ -311,3 +343,4 @@ an inner join you may want to swap left and right DataFrames. Note that result columns order may differ for first and second ``merge``. .. _range-partitioning: https://www.techopedia.com/definition/31994/range-partitioning +.. 
_`avoid tiny task`: https://docs.ray.io/en/latest/ray-core/tips-for-first-time.html#tip-2-avoid-tiny-tasks
diff --git a/modin/config/__init__.py b/modin/config/__init__.py
index cf5f7895c5d..60806a79231 100644
--- a/modin/config/__init__.py
+++ b/modin/config/__init__.py
@@ -23,6 +23,7 @@
     CpuCount,
     DaskThreadsPerWorker,
     DocModule,
+    DynamicPartitioning,
     Engine,
     EnvironmentVariable,
     GithubCI,
@@ -95,6 +96,7 @@
     "AsyncReadMode",
     "ReadSqlEngine",
     "IsExperimental",
+    "DynamicPartitioning",
     # For tests
     "TrackFileLeaks",
     "TestReadFromSqlServer",
diff --git a/modin/config/envvars.py b/modin/config/envvars.py
index 3635c63d026..60d82e4a22c 100644
--- a/modin/config/envvars.py
+++ b/modin/config/envvars.py
@@ -892,6 +892,18 @@ class DaskThreadsPerWorker(EnvironmentVariable, type=int):
     default = 1
 
 
+class DynamicPartitioning(EnvironmentVariable, type=bool):
+    """
+    Set to true to use Modin's dynamic-partitioning implementation where possible.
+
+    Please refer to the documentation for cases where enabling this option would be beneficial:
+    https://modin.readthedocs.io/en/stable/usage_guide/optimization_notes/index.html#dynamic-partitioning-in-modin
+    """
+
+    varname = "MODIN_DYNAMIC_PARTITIONING"
+    default = False
+
+
 def _check_vars() -> None:
     """
     Check validity of environment variables.
diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
index 8b4f6788931..05854239206 100644
--- a/modin/core/dataframe/pandas/partitioning/partition_manager.py
+++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -30,6 +30,7 @@
 from modin.config import (
     BenchmarkMode,
     CpuCount,
+    DynamicPartitioning,
     Engine,
     MinColumnPartitionSize,
     MinRowPartitionSize,
@@ -675,7 +676,7 @@ def map_partitions(
         NumPy array
             An array of partitions
         """
-        if np.prod(partitions.shape) <= 1.5 * CpuCount.get():
+        if not DynamicPartitioning.get():
             # block-wise map
             new_partitions = cls.base_map_partitions(
                 partitions, map_func, func_args, func_kwargs
diff --git a/modin/tests/core/storage_formats/pandas/test_internals.py b/modin/tests/core/storage_formats/pandas/test_internals.py
index 4113f3ce0ed..b030fe7b216 100644
--- a/modin/tests/core/storage_formats/pandas/test_internals.py
+++ b/modin/tests/core/storage_formats/pandas/test_internals.py
@@ -2658,7 +2658,7 @@ def remote_func():
         ),
     ],
 )
-def test_map_approaches(partitioning_scheme, expected_map_approach):
+def test_dynamic_partitioning(partitioning_scheme, expected_map_approach):
     data_size = MinRowPartitionSize.get() * CpuCount.get()
     data = {f"col{i}": np.ones(data_size) for i in range(data_size)}
     df = pandas.DataFrame(data)
@@ -2672,8 +2672,9 @@ def test_map_approaches(partitioning_scheme, expected_map_approach):
         expected_map_approach,
         wraps=getattr(partition_mgr_cls, expected_map_approach),
     ) as expected_method:
-        partition_mgr_cls.map_partitions(partitions, lambda x: x * 2)
-        expected_method.assert_called()
+        with context(DynamicPartitioning=True):
+            partition_mgr_cls.map_partitions(partitions, lambda x: x * 2)
+        expected_method.assert_called()

From 8fc230a0a624eb61389859eb4fbf55fc0a2bda0c Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Mon, 26 Aug 2024 13:34:37 +0200
Subject: [PATCH 10/20] FIX-#7373: Try a previous version of `motoserver/moto`
 service, pin to 5.0.13 (#7374)

Signed-off-by: Anatoly Myachev
---
 .github/workflows/ci.yml           | 8 ++++----
 .github/workflows/push-to-main.yml | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff 
--git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f82d4ca7f9..10b17d0e4a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -293,7 +293,7 @@ jobs: name: test-ubuntu (engine unidist ${{matrix.unidist-backend}}, python ${{matrix.python-version}}) services: moto: - image: motoserver/moto + image: motoserver/moto:5.0.13 ports: - 5000:5000 env: @@ -382,7 +382,7 @@ jobs: # Using workaround https://github.com/actions/runner/issues/822#issuecomment-1524826092 moto: # we only need moto service on Ubuntu and for group_4 task or python engine - image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')) && 'motoserver/moto' || '' }} + image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')) && 'motoserver/moto:5.0.13' || '' }} ports: - 5000:5000 env: @@ -508,7 +508,7 @@ jobs: name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution.name }}, python ${{matrix.python-version}}) services: moto: - image: ${{ matrix.os != 'windows' && 'motoserver/moto' || '' }} + image: ${{ matrix.os != 'windows' && 'motoserver/moto:5.0.13' || '' }} ports: - 5000:5000 env: @@ -623,7 +623,7 @@ jobs: name: test experimental services: moto: - image: motoserver/moto + image: motoserver/moto:5.0.13 ports: - 5000:5000 env: diff --git a/.github/workflows/push-to-main.yml b/.github/workflows/push-to-main.yml index f8f05541ce1..dc0e83e102c 100644 --- a/.github/workflows/push-to-main.yml +++ b/.github/workflows/push-to-main.yml @@ -19,7 +19,7 @@ jobs: shell: bash -l {0} services: moto: - image: motoserver/moto + image: motoserver/moto:5.0.13 ports: - 5000:5000 env: From da015711d94787e044624a08b2660377eacab30f Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:26:11 +0200 Subject: [PATCH 11/20] FEAT-#4605: Add native query compiler (#7259) Co-authored-by: Igoshev, Iaroslav Signed-off-by: arunjose696 --- .github/workflows/ci.yml | 35 + modin/config/__init__.py | 2 + modin/config/envvars.py | 22 + .../dispatching/factories/factories.py | 5 + .../pandas/native_query_compiler.py | 1285 +++++++++++++++++ modin/tests/pandas/dataframe/test_binary.py | 6 +- modin/tests/pandas/dataframe/test_default.py | 25 +- modin/tests/pandas/dataframe/test_indexing.py | 10 +- modin/tests/pandas/dataframe/test_iter.py | 8 +- .../tests/pandas/dataframe/test_join_sort.py | 9 +- .../pandas/dataframe/test_map_metadata.py | 22 +- modin/tests/pandas/dataframe/test_pickle.py | 1 - modin/tests/test_utils.py | 18 +- 13 files changed, 1427 insertions(+), 21 deletions(-) create mode 100644 modin/core/storage_formats/pandas/native_query_compiler.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 10b17d0e4a6..9186500682a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -246,12 +246,16 @@ jobs: unidist: ${{ steps.filter.outputs.unidist }} engines: ${{ steps.engines.outputs.engines }} experimental: ${{ steps.experimental.outputs.experimental }} + test-native-dataframe-mode: ${{ steps.filter.outputs.test-native-dataframe-mode }} steps: - uses: actions/checkout@v4 - uses: dorny/paths-filter@v3 id: filter with: filters: | + test-native-dataframe-mode: + - 'modin/core/storage_formats/pandas/native_query_compiler.py' + - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared - 'modin/core/execution/dispatching/**' ray: @@ -665,6 +669,37 @@ jobs: python-version: ${{matrix.python-version}} - run: python -m pytest 
modin/tests/experimental/spreadsheet/test_general.py + test-native-dataframe-mode: + needs: [ lint-flake8, execution-filter] + if: ${{ needs.execution-filter.outputs.test-native-dataframe-mode == 'true' }} + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.9"] + env: + MODIN_NATIVE_DATAFRAME_MODE: "Pandas" + name: test-native-dataframe-mode python ${{matrix.python-version}}) + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/mamba-env + with: + environment-file: environment-dev.yml + python-version: ${{matrix.python-version}} + - run: python -m pytest modin/tests/pandas/dataframe/test_binary.py + - run: python -m pytest modin/tests/pandas/dataframe/test_default.py + - run: python -m pytest modin/tests/pandas/dataframe/test_indexing.py + - run: python -m pytest modin/tests/pandas/dataframe/test_iter.py + - run: python -m pytest modin/tests/pandas/dataframe/test_join_sort.py + - run: python -m pytest modin/tests/pandas/dataframe/test_map_metadata.py + - run: python -m pytest modin/tests/pandas/dataframe/test_pickle.py + - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py + - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py + - run: python -m pytest modin/tests/pandas/dataframe/test_window.py + - uses: ./.github/actions/upload-coverage + merge-coverage-artifacts: needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity] if: always() # we need to run it regardless of some job being skipped, like in PR diff --git a/modin/config/__init__.py b/modin/config/__init__.py index 60806a79231..d38596eff5c 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -40,6 +40,7 @@ MinPartitionSize, MinRowPartitionSize, ModinNumpy, + NativeDataframeMode, NPartitions, PersistentPickle, ProgressBar, @@ -69,6 +70,7 @@ "CpuCount", "GpuCount", "Memory", + "NativeDataframeMode", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 60d82e4a22c..97ed1579667 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -943,4 +943,26 @@ def _check_vars() -> None: ) +class NativeDataframeMode(EnvironmentVariable, type=str): + """ + Configures the query compiler to process Modin data. + + When this config is set to ``Default``, ``PandasQueryCompiler`` is used, + which leads to Modin executing dataframes in distributed fashion. + When set to a string (e.g., ``pandas``), ``NativeQueryCompiler`` is used, + which handles the dataframes without distributing, + falling back to native library functions (e.g., ``pandas``). + + This could be beneficial for handling relatively small dataframes + without involving additional overhead of communication between processes. 
+ """ + + varname = "MODIN_NATIVE_DATAFRAME_MODE" + choices = ( + "Default", + "Pandas", + ) + default = "Default" + + _check_vars() diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index e7f2493e404..deda5113287 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -26,7 +26,9 @@ import pandas from pandas.util._decorators import doc +from modin.config import NativeDataframeMode from modin.core.io import BaseIO +from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler from modin.utils import get_current_execution _doc_abstract_factory_class = """ @@ -168,6 +170,9 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): + if NativeDataframeMode.get() == "Pandas": + df_copy = df.copy() + return NativeQueryCompiler(df_copy) return cls.io_cls.from_pandas(df) @classmethod diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py new file mode 100644 index 00000000000..bfe331cfc6e --- /dev/null +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -0,0 +1,1285 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +""" +Module contains ``NativeQueryCompiler`` class. + +``NativeQueryCompiler`` is responsible for compiling efficient DataFrame algebra +queries for small data and empty ``PandasDataFrame``. +""" + +from typing import Optional + +import numpy as np +import pandas +from pandas.core.dtypes.common import is_list_like, is_scalar + +from modin.config.envvars import NativeDataframeMode +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.utils import ( + MODIN_UNNAMED_SERIES_LABEL, + _inherit_docstrings, + try_cast_to_pandas, +) + + +def _get_axis(axis): + """ + Build index labels getter of the specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to get labels from. 0 is for index and 1 is for column. + + Returns + ------- + callable(NativeQueryCompiler) -> pandas.Index + """ + if axis == 0: + return lambda self: self._modin_frame.index + else: + return lambda self: self._modin_frame.columns + + +def _set_axis(axis): + """ + Build index labels setter of the specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to set labels on. 0 is for index and 1 is for column. + + Returns + ------- + callable(NativeQueryCompiler) + """ + if axis == 0: + + def set_axis(self, idx): + self._modin_frame.index = idx + + else: + + def set_axis(self, cols): + self._modin_frame.columns = cols + + return set_axis + + +def _str_map(func_name): + """ + Build function that calls specified string function on frames ``str`` accessor. 
+
+    Parameters
+    ----------
+    func_name : str
+        String function name to execute on the ``str`` accessor.
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+    """
+
+    def str_op_builder(df, *args, **kwargs):
+        """Apply specified function against `str` accessor of the passed frame."""
+        str_s = df.squeeze(axis=1).str
+        return getattr(pandas.Series.str, func_name)(str_s, *args, **kwargs).to_frame()
+
+    return str_op_builder
+
+
+def _dt_prop_map(property_name):
+    """
+    Build function that accesses the specified property of the ``dt`` property of the passed frame.
+
+    Parameters
+    ----------
+    property_name : str
+        Date-time property name to access.
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+        Function to be applied to the frame.
+
+    Notes
+    -----
+    This applies non-callable properties of ``Series.dt``.
+    """
+
+    def dt_op_builder(df, *args, **kwargs):
+        """Access specified date-time property of the passed frame."""
+        prop_val = getattr(df.squeeze(axis=1).dt, property_name)
+        if isinstance(prop_val, pandas.Series):
+            return prop_val.to_frame()
+        elif isinstance(prop_val, pandas.DataFrame):
+            return prop_val
+        else:
+            return pandas.DataFrame([prop_val])
+
+    return dt_op_builder
+
+
+def _dt_func_map(func_name):
+    """
+    Build function that applies the specified method against the ``dt`` property of the passed frame.
+
+    Parameters
+    ----------
+    func_name : str
+        Date-time function name to apply.
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+        Function to be applied to the frame.
+
+    Notes
+    -----
+    This applies callable methods of ``Series.dt``.
+    """
+
+    def dt_op_builder(df, *args, **kwargs):
+        """Apply specified function against ``dt`` accessor of the passed frame."""
+        dt_s = df.squeeze(axis=1).dt
+        dt_func_result = getattr(pandas.Series.dt, func_name)(dt_s, *args, **kwargs)
+        # If we don't specify the dtype for the frame, the frame might get the
+        # wrong dtype, e.g. for to_pydatetime in https://github.com/modin-project/modin/issues/4436
+        return pandas.DataFrame(dt_func_result, dtype=dt_func_result.dtype)
+
+    return dt_op_builder
+
+
+def _rolling_func(func):
+    """
+    Build function that applies the specified rolling method of the passed frame.
+
+    Parameters
+    ----------
+    func : str
+        Rolling function name to apply.
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+        Function to be applied to the frame.
+    
+ """ + + def rolling_builder(df, fold_axis, rolling_args, *args, **kwargs): + rolling_result = df.rolling(*rolling_args) + rolling_op = getattr(rolling_result, func) + return rolling_op(*args, **kwargs) + + return rolling_builder + + +def _reindex(df, axis, labels, **kwargs): # noqa: GL08 + return df.reindex(labels=labels, axis=axis, **kwargs) + + +def _concat(df, axis, other, join_axes=None, **kwargs): # noqa: GL08 + if not isinstance(other, list): + other = [other] + if ( + isinstance(df, pandas.DataFrame) + and len(df.columns) == 1 + and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ): + df = df[df.columns[0]] + + ignore_index = kwargs.get("ignore_index", False) + concat_join = ["outer", "inner"] + if kwargs.get("join", "outer") in concat_join: + if not isinstance(other, list): + other = [other] + other = [df] + other + result = pandas.concat(other, axis=axis, **kwargs) + else: + if isinstance(other, (list, np.ndarray)) and len(other) == 1: + other = other[0] + ignore_index = kwargs.pop("ignore_index", None) + kwargs["how"] = kwargs.pop("join", None) + if isinstance(other, (pandas.DataFrame, pandas.Series)): + result = df.join(other, rsuffix="r_", **kwargs) + else: + result = df.join(other, **kwargs) + if ignore_index: + if axis == 0: + result = result.reset_index(drop=True) + else: + result.columns = pandas.RangeIndex(len(result.columns)) + return result + + +def _to_datetime(df, *args, **kwargs): # noqa: GL08 + return pandas.to_datetime(df.squeeze(axis=1), *args, **kwargs) + + +def _to_numeric(df, *args, **kwargs): # noqa: GL08 + return pandas.to_numeric(df.squeeze(axis=1), *args, **kwargs) + + +def _groupby(agg_name): + """ + Build function that apply specified groupby method of the passed frame. + + Parameters + ---------- + agg_name : str + GroupBy aggregate function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. 
+ """ + __aggregation_methods_dict = { + "axis_wise": pandas.core.groupby.DataFrameGroupBy.aggregate, + "group_wise": pandas.core.groupby.DataFrameGroupBy.apply, + "transform": pandas.core.groupby.DataFrameGroupBy.transform, + } + + def groupby_callable( + df, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + agg_func=None, + how="axis_wise", + drop=False, + **kwargs, + ): + by_names = [] + if isinstance(by, pandas.DataFrame): + by = by.squeeze(axis=1) + if isinstance(by, list): + for i in range(len(by)): + if isinstance(by[i], pandas.DataFrame): + by[i] = by[i].squeeze(axis=1) + if isinstance(by[i], pandas.Series): + if isinstance(df.index, pandas.MultiIndex): + by[i].name = pandas.MultiIndex.from_tuples(by[i].name) + by_names.append(by[i].name) + elif isinstance(by[i], str): + by_names.append(by[i]) + if isinstance(by, pandas.DataFrame): + by_names = list(by.columns) + to_append = by.columns[[name not in df.columns for name in by_names]] + if len(to_append) > 0: + df = pandas.concat([df, by[to_append]], axis=1) + by = by_names + if isinstance(by, pandas.Series) and drop: + by_names = [by.name] + if ( + is_list_like(by) + and drop + and not any([is_list_like(curr_by) for curr_by in by]) + ): + by = by_names + + groupby_obj = df.groupby(by=by, axis=axis, **groupby_kwargs) + if agg_name == "agg": + if isinstance(agg_func, dict): + agg_func = { + k: v[0] if isinstance(v, list) and len(v) == 1 else v + for k, v in agg_func.items() + } + groupby_agg = __aggregation_methods_dict[how] + result = groupby_agg(groupby_obj, agg_func, *agg_args, **agg_kwargs) + else: + groupby_agg = getattr(groupby_obj, agg_name) + if callable(groupby_agg): + result = groupby_agg(*agg_args, **agg_kwargs) + else: + result = groupby_agg + + return result + + return groupby_callable + + +def _register_binary(op): + """ + Build function that apply specified binary method of the passed frame. + + Parameters + ---------- + op : str + Binary function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + + def binary_operator(df, other, **kwargs): + squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( + "squeeze_other", False + ) + squeeze_self = kwargs.pop("squeeze_self", False) + + if squeeze_other: + other = other.squeeze(axis=1) + + if squeeze_self: + df = df.squeeze(axis=1) + result = getattr(df, op)(other, **kwargs) + if ( + not isinstance(result, pandas.Series) + and not isinstance(result, pandas.DataFrame) + and is_list_like(result) + ): + result = pandas.DataFrame(result) + + return result + + return binary_operator + + +def _register_expanding(func): + """ + Build function that apply specified expanding window functions. + + Parameters + ---------- + func : str + Expanding window functionname to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + + def expanding_operator(df, fold_axis, rolling_args, *args, **kwargs): + squeeze_self = kwargs.pop("squeeze_self", False) + + if squeeze_self: + df = df.squeeze(axis=1) + roller = df.expanding(*rolling_args) + if type(func) is property: + return func.fget(roller) + + return func(roller, *args, **kwargs) + + return expanding_operator + + +def _register_resample(op): + """ + Build function that apply specified resample method of the passed frame. + + Parameters + ---------- + op : str + Resample function name to apply. 
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+        Function to be applied to the frame.
+    """
+
+    def resample_operator(df, resample_kwargs, *args, **kwargs):
+        resampler = df.resample(**resample_kwargs)
+        result = getattr(resampler, op)(*args, **kwargs)
+        return result
+
+    return resample_operator
+
+
+def _drop(df, **kwargs):  # noqa: GL08
+    if (
+        kwargs.get("labels", None) is not None
+        or kwargs.get("index", None) is not None
+        or kwargs.get("columns", None) is not None
+    ):
+        return df.drop(**kwargs)
+    return df
+
+
+def _fillna(df, value, **kwargs):  # noqa: GL08
+    squeeze_self = kwargs.pop("squeeze_self", False)
+    squeeze_value = kwargs.pop("squeeze_value", False)
+    if squeeze_self and isinstance(df, pandas.DataFrame):
+        df = df.squeeze(axis=1)
+    if squeeze_value and isinstance(value, pandas.DataFrame):
+        value = value.squeeze(axis=1)
+    return df.fillna(value, **kwargs)
+
+
+def _is_monotonic(monotonic_type):  # noqa: GL08
+    def is_monotonic_caller(ser):
+        return pandas.DataFrame([getattr(ser, monotonic_type)])
+
+    return is_monotonic_caller
+
+
+def _sort_index(df, inplace=False, **kwargs):  # noqa: GL08
+    if inplace:
+        df.sort_index(inplace=inplace, **kwargs)
+    else:
+        df = df.sort_index(inplace=inplace, **kwargs)
+    return df
+
+
+def _combine(df, other, func, **kwargs):  # noqa: GL08
+    if isinstance(df, pandas.Series):
+        return func(df, other)
+    return df.combine(other, func)
+
+
+def _getitem_array(df, key):  # noqa: GL08
+    if isinstance(key, pandas.DataFrame):
+        key = key.squeeze(axis=1)
+    return df[key]
+
+
+def _getitem_row_array(df, key):  # noqa: GL08
+    if isinstance(key, pandas.DataFrame):
+        key = key.squeeze(axis=1)
+    return df.iloc[key]
+
+
+def _write_items(
+    df,
+    row_numeric_index,
+    col_numeric_index,
+    item,
+    need_columns_reindex=True,
+):  # noqa: GL08
+    from modin.pandas.utils import broadcast_item, is_scalar
+
+    if not isinstance(row_numeric_index, slice):
+        row_numeric_index = list(row_numeric_index)
+    if not isinstance(col_numeric_index, slice):
+        col_numeric_index = list(col_numeric_index)
+    if not is_scalar(item):
+        broadcasted_items, _ = broadcast_item(
+            df,
+            row_numeric_index,
+            col_numeric_index,
+            item,
+            need_columns_reindex=need_columns_reindex,
+        )
+    else:
+        broadcasted_items = item
+
+    if isinstance(df.iloc[row_numeric_index, col_numeric_index], pandas.Series):
+        broadcasted_items = broadcasted_items.squeeze()
+    df.iloc[row_numeric_index, col_numeric_index] = broadcasted_items
+    return df
+
+
+def _setitem(df, axis, key, value):  # noqa: GL08
+    if is_scalar(key) and isinstance(value, pandas.DataFrame):
+        value = value.squeeze()
+    if not axis:
+        df[key] = value
+    else:
+        df.loc[key] = value
+    return df
+
+
+def _delitem(df, key):  # noqa: GL08
+    return df.drop(columns=[key])
+
+
+def _get_dummies(df, columns, **kwargs):  # noqa: GL08
+    return pandas.get_dummies(df, columns=columns, **kwargs)
+
+
+def _register_default_pandas(
+    func,
+    is_series=False,
+    squeeze_args=False,
+    squeeze_kwargs=False,
+    return_raw=False,
+    in_place=False,
+):
+    """
+    Build function that applies the specified method of the passed frame.
+
+    Parameters
+    ----------
+    func : callable
+        Function to apply.
+    is_series : bool, default: False
+        If True, the passed frame will always be squeezed to a series.
+    squeeze_args : bool, default: False
+        If True, all passed arguments will be squeezed.
+    squeeze_kwargs : bool, default: False
+        If True, all passed keyword arguments will be squeezed.
+    return_raw : bool, default: False
+        If True, and the result is not a DataFrame or Series, it is returned as-is without wrapping in a query compiler.
+    in_place : bool, default: False
+        If True, the specified function will be applied on the passed frame in place.
+
+    Returns
+    -------
+    callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame
+        Function to be applied to the frame.
+    """
+
+    def caller(query_compiler, *args, **kwargs):
+        df = query_compiler._modin_frame
+        if is_series:
+            df = df.squeeze(axis=1)
+        exclude_names = ["fold_axis", "dtypes"]
+        kwargs = kwargs.copy()
+        for name in exclude_names:
+            kwargs.pop(name, None)
+        args = try_cast_to_pandas(args, squeeze=squeeze_args)
+        kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs)
+        result = func(df, *args, **kwargs)
+        inplace_method = kwargs.get("inplace", False)
+        if in_place:
+            inplace_method = in_place
+        if inplace_method:
+            result = df
+        if return_raw and not isinstance(result, (pandas.Series, pandas.DataFrame)):
+            return result
+        if isinstance(result, pandas.Series):
+            if result.name is None:
+                result.name = MODIN_UNNAMED_SERIES_LABEL
+            result = result.to_frame()
+
+        return query_compiler.__constructor__(result)
+
+    return caller
+
+
+@_inherit_docstrings(BaseQueryCompiler)
+class NativeQueryCompiler(BaseQueryCompiler):
+    """
+    Query compiler for the pandas storage format.
+
+    This class translates the common query compiler API into
+    native library functions (e.g., pandas) to execute operations
+    on small data depending on the threshold.
+
+    Parameters
+    ----------
+    pandas_frame : pandas.DataFrame
+        The pandas frame to query with the compiled queries.
+    shape_hint : {"row", "column", None}, default: None
+        Shape hint for frames known to be a column or a row, otherwise None.
+    """
+
+    _modin_frame: pandas.DataFrame
+    _shape_hint: Optional[str]
+
+    def __init__(self, pandas_frame, shape_hint: Optional[str] = None):
+        assert NativeDataframeMode.get() == "Pandas"
+        if hasattr(pandas_frame, "_to_pandas"):
+            pandas_frame = pandas_frame._to_pandas()
+        if is_scalar(pandas_frame):
+            pandas_frame = pandas.DataFrame([pandas_frame])
+        elif not isinstance(pandas_frame, pandas.DataFrame):
+            pandas_frame = pandas.DataFrame(pandas_frame)
+
+        self._modin_frame = pandas_frame
+        self._shape_hint = shape_hint
+
+    def execute(self):
+        pass
+
+    @property
+    def frame_has_materialized_dtypes(self) -> bool:
+        """
+        Check if the underlying dataframe has materialized dtypes.
+
+        Returns
+        -------
+        bool
+        """
+        return True
+
+    def set_frame_dtypes_cache(self, dtypes):
+        """
+        Set dtypes cache for the underlying dataframe.
+
+        Parameters
+        ----------
+        dtypes : pandas.Series, ModinDtypes, callable or None
+
+        Notes
+        -----
+        This function is for consistency with other QCs;
+        dtypes should be assigned directly on the frame.
+        """
+        pass
+
+    def set_frame_index_cache(self, index):
+        """
+        Set index cache for the underlying dataframe.
+
+        Parameters
+        ----------
+        index : sequence, callable or None
+
+        Notes
+        -----
+        This function is for consistency with other QCs;
+        index should be assigned directly on the frame.
+        """
+        pass
+
+    @property
+    def frame_has_index_cache(self):
+        """
+        Check if the index cache exists for the underlying dataframe.
+
+        Returns
+        -------
+        bool
+        """
+        return True
+
+    @property
+    def frame_has_dtypes_cache(self) -> bool:
+        """
+        Check if the dtypes cache exists for the underlying dataframe.
+ + Returns + ------- + bool + """ + return True + + def take_2d_positional(self, index=None, columns=None): + index = slice(None) if index is None else index + columns = slice(None) if columns is None else columns + return self.__constructor__(self._modin_frame.iloc[index, columns]) + + def copy(self): + return self.__constructor__(self._modin_frame.copy()) + + def setitem_bool(self, row_loc, col_loc, item): + + self._modin_frame.loc[row_loc._modin_frame.squeeze(axis=1), col_loc] = item + return self.__constructor__(self._modin_frame) + + __and__ = _register_default_pandas(pandas.DataFrame.__and__) + __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) + __eq__ = _register_default_pandas(pandas.DataFrame.__eq__) + __format__ = _register_default_pandas(pandas.DataFrame.__format__) + __ge__ = _register_default_pandas(pandas.DataFrame.__ge__) + __gt__ = _register_default_pandas(pandas.DataFrame.__gt__) + __le__ = _register_default_pandas(pandas.DataFrame.__le__) + __lt__ = _register_default_pandas(pandas.DataFrame.__lt__) + __ne__ = _register_default_pandas(pandas.DataFrame.__ne__) + __or__ = _register_default_pandas(pandas.DataFrame.__or__) + __rand__ = _register_default_pandas(pandas.DataFrame.__rand__) + __reduce__ = _register_default_pandas(pandas.DataFrame.__reduce__, return_raw=True) + __reduce_ex__ = _register_default_pandas( + pandas.DataFrame.__reduce_ex__, return_raw=True + ) + __ror__ = _register_default_pandas(pandas.DataFrame.__ror__) + __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__) + __sizeof__ = _register_default_pandas(pandas.DataFrame.__sizeof__) + __xor__ = _register_default_pandas(pandas.DataFrame.__xor__) + abs = _register_default_pandas(pandas.DataFrame.abs) + add = _register_default_pandas(_register_binary("add")) + all = _register_default_pandas(pandas.DataFrame.all) + any = _register_default_pandas(pandas.DataFrame.any) + apply = _register_default_pandas(pandas.DataFrame.apply) + apply_on_series = _register_default_pandas(pandas.Series.apply, is_series=True) + applymap = _register_default_pandas(pandas.DataFrame.applymap) + astype = _register_default_pandas(pandas.DataFrame.astype) + case_when = _register_default_pandas(pandas.Series.case_when) + cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) + combine = _register_default_pandas(_combine) + combine_first = _register_default_pandas(lambda df, other: df.combine_first(other)) + compare = _register_default_pandas(pandas.DataFrame.compare) + concat = _register_default_pandas(_concat) + conj = _register_default_pandas( + lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df)) + ) + convert_dtypes = _register_default_pandas(pandas.DataFrame.convert_dtypes) + count = _register_default_pandas(pandas.DataFrame.count) + corr = _register_default_pandas(pandas.DataFrame.corr) + cov = _register_default_pandas(pandas.DataFrame.cov) + cummax = _register_default_pandas(pandas.DataFrame.cummax) + cummin = _register_default_pandas(pandas.DataFrame.cummin) + cumprod = _register_default_pandas(pandas.DataFrame.cumprod) + cumsum = _register_default_pandas(pandas.DataFrame.cumsum) + delitem = _register_default_pandas(_delitem) + df_update = _register_default_pandas(pandas.DataFrame.update, in_place=True) + diff = _register_default_pandas(pandas.DataFrame.diff) + dot = _register_default_pandas(_register_binary("dot")) + drop = _register_default_pandas(_drop) + dropna = _register_default_pandas(pandas.DataFrame.dropna) # axis values switched? 
+ dt_ceil = _register_default_pandas(_dt_func_map("ceil")) + dt_components = _register_default_pandas(_dt_prop_map("components")) + dt_date = _register_default_pandas(_dt_prop_map("date")) + dt_day = _register_default_pandas(_dt_prop_map("day")) + dt_day_name = _register_default_pandas(_dt_func_map("day_name")) + dt_dayofweek = _register_default_pandas(_dt_prop_map("dayofweek")) + dt_dayofyear = _register_default_pandas(_dt_prop_map("dayofyear")) + dt_days = _register_default_pandas(_dt_prop_map("days")) + dt_days_in_month = _register_default_pandas(_dt_prop_map("days_in_month")) + dt_daysinmonth = _register_default_pandas(_dt_prop_map("daysinmonth")) + dt_end_time = _register_default_pandas(_dt_prop_map("end_time")) + dt_floor = _register_default_pandas(_dt_func_map("floor")) + dt_freq = _register_default_pandas( + lambda df: pandas.DataFrame([df.squeeze(axis=1).dt.freq]) + ) + dt_hour = _register_default_pandas(_dt_prop_map("hour")) + dt_is_leap_year = _register_default_pandas(_dt_prop_map("is_leap_year")) + dt_is_month_end = _register_default_pandas(_dt_prop_map("is_month_end")) + dt_is_month_start = _register_default_pandas(_dt_prop_map("is_month_start")) + dt_is_quarter_end = _register_default_pandas(_dt_prop_map("is_quarter_end")) + dt_is_quarter_start = _register_default_pandas(_dt_prop_map("is_quarter_start")) + dt_is_year_end = _register_default_pandas(_dt_prop_map("is_year_end")) + dt_is_year_start = _register_default_pandas(_dt_prop_map("is_year_start")) + dt_microsecond = _register_default_pandas(_dt_prop_map("microsecond")) + dt_microseconds = _register_default_pandas(_dt_prop_map("microseconds")) + dt_minute = _register_default_pandas(_dt_prop_map("minute")) + dt_month = _register_default_pandas(_dt_prop_map("month")) + dt_month_name = _register_default_pandas(_dt_func_map("month_name")) + dt_nanosecond = _register_default_pandas(_dt_prop_map("nanosecond")) + dt_nanoseconds = _register_default_pandas(_dt_prop_map("nanoseconds")) + dt_normalize = _register_default_pandas(_dt_func_map("normalize")) + dt_quarter = _register_default_pandas(_dt_prop_map("quarter")) + dt_qyear = _register_default_pandas(_dt_prop_map("qyear")) + dt_round = _register_default_pandas(_dt_func_map("round")) + dt_second = _register_default_pandas(_dt_prop_map("second")) + dt_seconds = _register_default_pandas(_dt_prop_map("seconds")) + dt_start_time = _register_default_pandas(_dt_prop_map("start_time")) + dt_strftime = _register_default_pandas(_dt_func_map("strftime")) + dt_time = _register_default_pandas(_dt_prop_map("time")) + dt_timetz = _register_default_pandas(_dt_prop_map("timetz")) + dt_to_period = _register_default_pandas(_dt_func_map("to_period")) + dt_to_pydatetime = _register_default_pandas(_dt_func_map("to_pydatetime")) + dt_to_pytimedelta = _register_default_pandas(_dt_func_map("to_pytimedelta")) + dt_to_timestamp = _register_default_pandas(_dt_func_map("to_timestamp")) + dt_total_seconds = _register_default_pandas(_dt_func_map("total_seconds")) + dt_tz = _register_default_pandas( + lambda df: pandas.DataFrame([df.squeeze(axis=1).dt.tz]) + ) + dt_tz_convert = _register_default_pandas(_dt_func_map("tz_convert")) + dt_tz_localize = _register_default_pandas(_dt_func_map("tz_localize")) + dt_week = _register_default_pandas(_dt_prop_map("week")) + dt_weekday = _register_default_pandas(_dt_prop_map("weekday")) + dt_weekofyear = _register_default_pandas(_dt_prop_map("weekofyear")) + dt_year = _register_default_pandas(_dt_prop_map("year")) + duplicated = 
_register_default_pandas(pandas.DataFrame.duplicated) + eq = _register_default_pandas(_register_binary("eq")) + equals = _register_default_pandas(_register_binary("equals")) + eval = _register_default_pandas(pandas.DataFrame.eval) + explode = _register_default_pandas(pandas.DataFrame.explode) + expanding_count = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.count) + ) + expanding_sum = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.sum) + ) + expanding_mean = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.mean) + ) + expanding_median = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.median) + ) + expanding_std = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.std) + ) + expanding_min = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.min) + ) + expanding_max = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.max) + ) + expanding_skew = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.skew) + ) + expanding_kurt = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.kurt) + ) + expanding_sem = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.sem) + ) + expanding_quantile = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.quantile) + ) + expanding_aggregate = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.aggregate) + ) + expanding_var = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.var) + ) + expanding_rank = _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.rank) + ) + + fillna = _register_default_pandas(_fillna) + first_valid_index = _register_default_pandas( + pandas.DataFrame.first_valid_index, return_raw=True + ) + floordiv = _register_default_pandas(_register_binary("floordiv")) + ge = _register_default_pandas(_register_binary("ge")) + get_dummies = _register_default_pandas(_get_dummies) + getitem_array = _register_default_pandas(_getitem_array) + getitem_row_array = _register_default_pandas(_getitem_row_array) + groupby_agg = _register_default_pandas(_groupby("agg")) + groupby_all = _register_default_pandas(_groupby("all")) + groupby_any = _register_default_pandas(_groupby("any")) + groupby_count = _register_default_pandas(_groupby("count")) + groupby_cummax = _register_default_pandas(_groupby("cummax")) + groupby_cummin = _register_default_pandas(_groupby("cummin")) + groupby_cumprod = _register_default_pandas(_groupby("cumprod")) + groupby_cumsum = _register_default_pandas(_groupby("cumsum")) + groupby_dtypes = _register_default_pandas(_groupby("dtypes")) + groupby_fillna = _register_default_pandas(_groupby("fillna")) + groupby_max = _register_default_pandas(_groupby("max")) + groupby_mean = _register_default_pandas(_groupby("mean")) + groupby_median = _register_default_pandas(_groupby("median")) + groupby_min = _register_default_pandas(_groupby("min")) + groupby_nunique = _register_default_pandas(_groupby("nunique")) + groupby_prod = _register_default_pandas(_groupby("prod")) + groupby_quantile = _register_default_pandas(_groupby("quantile")) + groupby_rank = _register_default_pandas(_groupby("rank")) + groupby_shift = 
_register_default_pandas(_groupby("shift")) + groupby_skew = _register_default_pandas(_groupby("skew")) + groupby_std = _register_default_pandas(_groupby("std")) + groupby_sum = _register_default_pandas(_groupby("sum")) + groupby_var = _register_default_pandas(_groupby("var")) + gt = _register_default_pandas(_register_binary("gt")) + idxmax = _register_default_pandas(pandas.DataFrame.idxmax) + idxmin = _register_default_pandas(pandas.DataFrame.idxmin) + infer_objects = _register_default_pandas( + pandas.DataFrame.infer_objects, return_raw=True + ) + insert = _register_default_pandas( + pandas.DataFrame.insert, in_place=True, squeeze_args=True + ) + invert = _register_default_pandas(pandas.DataFrame.__invert__) + is_monotonic = _register_default_pandas( + _is_monotonic("is_monotonic"), is_series=True + ) + is_monotonic_decreasing = _register_default_pandas( + _is_monotonic("is_monotonic_decreasing"), is_series=True + ) + is_monotonic_increasing = _register_default_pandas( + _is_monotonic("is_monotonic_increasing"), is_series=True + ) + isna = _register_default_pandas(pandas.DataFrame.isna) + join = _register_default_pandas(pandas.DataFrame.join) + kurt = _register_default_pandas(pandas.DataFrame.kurt, return_raw=True) + last_valid_index = _register_default_pandas( + pandas.DataFrame.last_valid_index, return_raw=True + ) + le = _register_default_pandas(_register_binary("le")) + lt = _register_default_pandas(_register_binary("lt")) + # mad = _register_default_pandas(pandas.DataFrame.mad) + mask = _register_default_pandas(pandas.DataFrame.mask) + max = _register_default_pandas(pandas.DataFrame.max) + map = _register_default_pandas(pandas.DataFrame.map) + mean = _register_default_pandas(pandas.DataFrame.mean, return_raw=True) + median = _register_default_pandas(pandas.DataFrame.median, return_raw=True) + melt = _register_default_pandas(pandas.DataFrame.melt) + memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) + merge = _register_default_pandas(pandas.DataFrame.merge) + min = _register_default_pandas(pandas.DataFrame.min) + mod = _register_default_pandas(_register_binary("mod")) + mode = _register_default_pandas(pandas.DataFrame.mode) + mul = _register_default_pandas(_register_binary("mul")) + ne = _register_default_pandas(_register_binary("ne")) + negative = _register_default_pandas(pandas.DataFrame.__neg__) + nlargest = _register_default_pandas(pandas.DataFrame.nlargest) + notna = _register_default_pandas(pandas.DataFrame.notna) + nsmallest = _register_default_pandas(lambda df, **kwargs: df.nsmallest(**kwargs)) + nunique = _register_default_pandas(pandas.DataFrame.nunique) + pivot = _register_default_pandas(pandas.DataFrame.pivot) + pivot_table = _register_default_pandas(pandas.DataFrame.pivot_table) + pow = _register_default_pandas(_register_binary("pow")) + prod = _register_default_pandas(pandas.DataFrame.prod) + prod_min_count = _register_default_pandas(pandas.DataFrame.prod) + quantile_for_list_of_values = _register_default_pandas(pandas.DataFrame.quantile) + quantile_for_single_value = _register_default_pandas(pandas.DataFrame.quantile) + query = _register_default_pandas(pandas.DataFrame.query) + radd = _register_default_pandas(_register_binary("radd")) + rank = _register_default_pandas(pandas.DataFrame.rank) + reindex = _register_default_pandas(_reindex) + repeat = _register_default_pandas(pandas.Series.repeat, is_series=True) + replace = _register_default_pandas(pandas.DataFrame.replace) + resample_agg_df = _register_default_pandas(_register_resample("agg")) + 
resample_agg_ser = _register_default_pandas( + _register_resample("agg"), is_series=True + ) + resample_app_df = _register_default_pandas(_register_resample("apply")) + resample_app_ser = _register_default_pandas( + _register_resample("apply"), is_series=True + ) + resample_asfreq = _register_default_pandas(_register_resample("asfreq")) + resample_backfill = _register_default_pandas(_register_resample("backfill")) + resample_bfill = _register_default_pandas(_register_resample("bfill")) + resample_count = _register_default_pandas(_register_resample("count")) + resample_ffill = _register_default_pandas(_register_resample("ffill")) + resample_fillna = _register_default_pandas(_register_resample("fillna")) + resample_first = _register_default_pandas(_register_resample("first")) + resample_get_group = _register_default_pandas(_register_resample("get_group")) + resample_interpolate = _register_default_pandas(_register_resample("interpolate")) + resample_last = _register_default_pandas(_register_resample("last")) + resample_max = _register_default_pandas(_register_resample("max")) + resample_mean = _register_default_pandas(_register_resample("mean")) + resample_median = _register_default_pandas(_register_resample("median")) + resample_min = _register_default_pandas(_register_resample("min")) + resample_nearest = _register_default_pandas(_register_resample("nearest")) + resample_nunique = _register_default_pandas(_register_resample("nunique")) + resample_ohlc_df = _register_default_pandas(_register_resample("ohlc")) + resample_ohlc_ser = _register_default_pandas( + _register_resample("ohlc"), is_series=True + ) + resample_pad = _register_default_pandas(_register_resample("pad")) + resample_pipe = _register_default_pandas(_register_resample("pipe")) + resample_prod = _register_default_pandas(_register_resample("prod")) + resample_quantile = _register_default_pandas(_register_resample("quantile")) + resample_sem = _register_default_pandas(_register_resample("sem")) + resample_size = _register_default_pandas(_register_resample("size")) + resample_std = _register_default_pandas(_register_resample("std")) + resample_sum = _register_default_pandas(_register_resample("sum")) + resample_transform = _register_default_pandas(_register_resample("transform")) + resample_var = _register_default_pandas(_register_resample("var")) + reset_index = _register_default_pandas(pandas.DataFrame.reset_index) + rfloordiv = _register_default_pandas(_register_binary("rfloordiv")) + rmod = _register_default_pandas(_register_binary("rmod")) + rolling_aggregate = _register_default_pandas(_rolling_func("aggregate")) + rolling_apply = _register_default_pandas(_rolling_func("apply")) + rolling_corr = _register_default_pandas(_rolling_func("corr")) + rolling_count = _register_default_pandas(_rolling_func("count")) + rolling_cov = _register_default_pandas(_rolling_func("cov")) + rolling_kurt = _register_default_pandas(_rolling_func("kurt")) + rolling_max = _register_default_pandas(_rolling_func("max")) + rolling_mean = _register_default_pandas(_rolling_func("mean")) + rolling_median = _register_default_pandas(_rolling_func("median")) + rolling_min = _register_default_pandas(_rolling_func("min")) + rolling_quantile = _register_default_pandas(_rolling_func("quantile")) + rolling_skew = _register_default_pandas(_rolling_func("skew")) + rolling_std = _register_default_pandas(_rolling_func("std")) + rolling_sum = _register_default_pandas(_rolling_func("sum")) + rolling_var = _register_default_pandas(_rolling_func("var")) + round = 
_register_default_pandas(pandas.DataFrame.round) + rmul = _register_default_pandas(_register_binary("rmul")) + rpow = _register_default_pandas(_register_binary("rpow")) + rsub = _register_default_pandas(_register_binary("rsub")) + rtruediv = _register_default_pandas(_register_binary("rtruediv")) + searchsorted = _register_default_pandas(pandas.Series.searchsorted, is_series=True) + sem = _register_default_pandas(pandas.DataFrame.sem) + series_view = _register_default_pandas(pandas.Series.view, is_series=True) + set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) + setitem = _register_default_pandas(_setitem) + skew = _register_default_pandas(pandas.DataFrame.skew, return_raw=True) + sort_index = _register_default_pandas(_sort_index) + sort_columns_by_row_values = _register_default_pandas( + lambda df, columns, **kwargs: df.sort_values(by=columns, axis=1, **kwargs) + ) + sort_rows_by_column_values = _register_default_pandas( + lambda df, columns, **kwargs: df.sort_values(by=columns, axis=0, **kwargs) + ) + stack = _register_default_pandas(pandas.DataFrame.stack) + std = _register_default_pandas(pandas.DataFrame.std) + str___getitem__ = _register_default_pandas(_str_map("__getitem__")) + str_capitalize = _register_default_pandas(_str_map("capitalize")) + str_center = _register_default_pandas(_str_map("center")) + str_contains = _register_default_pandas(_str_map("contains")) + str_count = _register_default_pandas(_str_map("count")) + str_endswith = _register_default_pandas(_str_map("endswith")) + str_find = _register_default_pandas(_str_map("find")) + str_findall = _register_default_pandas(_str_map("findall")) + str_get = _register_default_pandas(_str_map("get")) + str_index = _register_default_pandas(_str_map("index")) + str_isalnum = _register_default_pandas(_str_map("isalnum")) + str_isalpha = _register_default_pandas(_str_map("isalpha")) + str_isdecimal = _register_default_pandas(_str_map("isdecimal")) + str_isdigit = _register_default_pandas(_str_map("isdigit")) + str_islower = _register_default_pandas(_str_map("islower")) + str_isnumeric = _register_default_pandas(_str_map("isnumeric")) + str_isspace = _register_default_pandas(_str_map("isspace")) + str_istitle = _register_default_pandas(_str_map("istitle")) + str_isupper = _register_default_pandas(_str_map("isupper")) + str_join = _register_default_pandas(_str_map("join")) + str_len = _register_default_pandas(_str_map("len")) + str_ljust = _register_default_pandas(_str_map("ljust")) + str_lower = _register_default_pandas(_str_map("lower")) + str_lstrip = _register_default_pandas(_str_map("lstrip")) + str_match = _register_default_pandas(_str_map("match")) + str_normalize = _register_default_pandas(_str_map("normalize")) + str_pad = _register_default_pandas(_str_map("pad")) + str_partition = _register_default_pandas(_str_map("partition")) + str_repeat = _register_default_pandas(_str_map("repeat")) + str_replace = _register_default_pandas(_str_map("replace")) + str_rfind = _register_default_pandas(_str_map("rfind")) + str_rindex = _register_default_pandas(_str_map("rindex")) + str_rjust = _register_default_pandas(_str_map("rjust")) + str_rpartition = _register_default_pandas(_str_map("rpartition")) + str_rsplit = _register_default_pandas(_str_map("rsplit")) + str_rstrip = _register_default_pandas(_str_map("rstrip")) + str_slice = _register_default_pandas(_str_map("slice")) + str_slice_replace = _register_default_pandas(_str_map("slice_replace")) + str_split = _register_default_pandas(_str_map("split")) + 
str_startswith = _register_default_pandas(_str_map("startswith")) + str_strip = _register_default_pandas(_str_map("strip")) + str_swapcase = _register_default_pandas(_str_map("swapcase")) + str_title = _register_default_pandas(_str_map("title")) + str_translate = _register_default_pandas(_str_map("translate")) + str_upper = _register_default_pandas(_str_map("upper")) + str_wrap = _register_default_pandas(_str_map("wrap")) + str_zfill = _register_default_pandas(_str_map("zfill")) + sub = _register_default_pandas(_register_binary("sub")) + sum = _register_default_pandas(pandas.DataFrame.sum) + sum_min_count = _register_default_pandas(pandas.DataFrame.sum) + to_datetime = _register_default_pandas(_to_datetime) + to_numeric = _register_default_pandas(_to_numeric) + to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_raw=True) + to_timedelta = _register_default_pandas( + lambda ser, *args, **kwargs: pandas.to_timedelta(ser, *args, **kwargs), + is_series=True, + ) + transpose = _register_default_pandas(pandas.DataFrame.transpose) + truediv = _register_default_pandas(_register_binary("truediv")) + unstack = _register_default_pandas(pandas.DataFrame.unstack) + var = _register_default_pandas(pandas.DataFrame.var) + where = _register_default_pandas(pandas.DataFrame.where) + window_mean = _register_default_pandas(_rolling_func("mean")) + window_std = _register_default_pandas(_rolling_func("std")) + window_sum = _register_default_pandas(_rolling_func("sum")) + window_var = _register_default_pandas(_rolling_func("var")) + write_items = _register_default_pandas(_write_items) + + T = property(transpose) + + add_prefix = _register_default_pandas(pandas.DataFrame.add_prefix) + add_suffix = _register_default_pandas(pandas.DataFrame.add_suffix) + + def clip(self, lower, upper, **kwargs): + if isinstance(lower, BaseQueryCompiler): + lower = lower.to_pandas().squeeze(1) + if isinstance(upper, BaseQueryCompiler): + upper = upper.to_pandas().squeeze(1) + return _register_default_pandas(pandas.DataFrame.clip)( + self, lower, upper, **kwargs + ) + + def describe(self, percentiles: np.ndarray): + return _register_default_pandas(pandas.DataFrame.describe)( + self, + percentiles=percentiles, + include="all", + ) + + def series_update(self, other, **kwargs): + return _register_default_pandas(_register_binary("update"), in_place=True)( + self, + other=other, + squeeze_self=True, + squeeze_other=True, + **kwargs, + ) + + def expanding_cov( + self, + fold_axis, + expanding_args, + squeeze_self, + squeeze_other, + other=None, + pairwise=None, + ddof=1, + numeric_only=False, + **kwargs, + ): + other_for_default = ( + other + if other is None + else ( + other.to_pandas().squeeze(axis=1) + if squeeze_other + else other.to_pandas() + ) + ) + return _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.cov) + )( + self, + fold_axis, + expanding_args, + other=other_for_default, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + squeeze_self=squeeze_self, + **kwargs, + ) + + def expanding_corr( + self, + fold_axis, + expanding_args, + squeeze_self, + squeeze_other, + other=None, + pairwise=None, + ddof=1, + numeric_only=False, + **kwargs, + ): + other_for_default = ( + other + if other is None + else ( + other.to_pandas().squeeze(axis=1) + if squeeze_other + else other.to_pandas() + ) + ) + return _register_default_pandas( + _register_expanding(pandas.core.window.expanding.Expanding.corr) + )( + self, + fold_axis, + expanding_args, + other=other_for_default, + 
pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + squeeze_self=squeeze_self, + **kwargs, + ) + + def groupby_size( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + result = _register_default_pandas(_groupby("size"))( + self, + by=by, + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + drop=drop, + method="size", + ) + if not groupby_kwargs.get("as_index", False): + # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name + + result.columns = result.columns[:-1].append(pandas.Index(["size"])) + return result + + def get_axis(self, axis): + return self._modin_frame.index if axis == 0 else self._modin_frame.columns + + def get_index_name(self, axis=0): + return self.get_axis(axis).name + + def get_index_names(self, axis=0): + return self.get_axis(axis).names + + def set_index_name(self, name, axis=0): + self.get_axis(axis).name = name + + def has_multiindex(self, axis=0): + if axis == 0: + return isinstance(self._modin_frame.index, pandas.MultiIndex) + assert axis == 1 + return isinstance(self._modin_frame.columns, pandas.MultiIndex) + + def isin(self, values, ignore_indices=False, **kwargs): + if isinstance(values, type(self)) and ignore_indices: + # Pandas logic is that it ignores indexing if 'values' is a 1D object + values = values.to_pandas().squeeze(axis=1) + if self._shape_hint == "column": + return _register_default_pandas(pandas.Series.isin, is_series=True)( + self, values, **kwargs + ) + else: + return _register_default_pandas(pandas.DataFrame.isin)( + self, values, **kwargs + ) + + def to_pandas(self): + return self._modin_frame + + @classmethod + def from_pandas(cls, df, data_cls): + return cls(df) + + @classmethod + def from_arrow(cls, at, data_cls): + return cls(at.to_pandas()) + + def free(self): + return + + def finalize(self): + return + + # Dataframe exchange protocol + + def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): + return self._modin_frame.__dataframe__( + nan_as_null=nan_as_null, allow_copy=allow_copy + ) + + @classmethod + def from_dataframe(cls, df, data_cls): + return cls(pandas.api.interchange.from_dataframe(df)) + + # END Dataframe exchange protocol + + index = property(_get_axis(0), _set_axis(0)) + columns = property(_get_axis(1), _set_axis(1)) + + @property + def dtypes(self): + return self._modin_frame.dtypes + + def getitem_column_array(self, key, numeric=False, ignore_order=False): + if numeric: + return self.__constructor__(self._modin_frame.iloc[:, key]) + return self.__constructor__(self._modin_frame.loc[:, key]) + + def is_series_like(self): + return len(self._modin_frame.columns) == 1 or len(self._modin_frame.index) == 1 + + def support_materialization_in_worker_process(self) -> bool: + """ + Whether it's possible to call function `to_pandas` during the pickling process, at the moment of recreating the object. + + Returns + ------- + bool + """ + return False + + def get_pandas_backend(self) -> Optional[str]: + """ + Get backend stored in `_modin_frame`. + + Returns + ------- + str | None + Backend name. 
+ """ + return None diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 108e2620aac..ea67592097c 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat +from modin.config import NativeDataframeMode, NPartitions, StorageFormat from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -210,6 +210,10 @@ def operation(df): StorageFormat.get() != "Pandas", reason="Modin on this engine doesn't create virtual partitions.", ) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not contain partitions.", +) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] ) diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 697a0d7f120..71f49924c94 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -22,7 +22,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -123,6 +123,10 @@ def test_to_numpy(data): assert_array_equal(modin_df.values, pandas_df.values) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not contain partitions.", +) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): frame = pd.DataFrame(data) @@ -294,8 +298,8 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - - assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) + if NativeDataframeMode.get() == "Default": + assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) ) @@ -313,6 +317,10 @@ def test_corr_non_numeric(self, numeric_only): StorageFormat.get() != "Pandas", reason="doesn't make sense for non-partitioned executions", ) + @pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not contain partitions.", + ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition modin_df, pandas_df = create_test_dfs( @@ -602,7 +610,13 @@ def test_pivot(data, index, columns, values, request): in request.node.callspec.id or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id - or (current_execution in ("BaseOnPython",) and index is lib.no_default) + or ( + ( + current_execution in ("BaseOnPython",) + or NativeDataframeMode.get() == "Pandas" + ) + and index is lib.no_default + ) ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7010") @@ -980,7 +994,8 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): "DateColumn", marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") - and StorageFormat.get() != "Base", + and StorageFormat.get() != "Base" + and NativeDataframeMode.get() == 
"Default", reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 52603343619..a47474eb76c 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions +from modin.config import MinRowPartitionSize, NativeDataframeMode, NPartitions from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( @@ -586,6 +586,10 @@ def test_loc_setting_single_categorical_column(): df_equals(modin_df, pandas_df) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not currently support IO functions.", +) def test_loc_multi_index(): modin_df = pd.read_csv( "modin/tests/pandas/data/blah.csv", header=[0, 1, 2, 3], index_col=0 @@ -2238,6 +2242,10 @@ def test___setitem__partitions_aligning(): df_equals(md_df, pd_df) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not currently support IO functions.", +) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",") diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index b00ae056920..38ab70524a2 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -142,7 +142,9 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame def test___finalize__(): data = test_data_values[0] - with warns_that_defaulting_to_pandas(): + # Using `force` for `NativeDataframeMode` as the warnings are raised at the API layer, + # before geting into the Query Compiler layer. + with warns_that_defaulting_to_pandas(force=True): pd.DataFrame(data).__finalize__(None) @@ -230,7 +232,9 @@ def test___repr__(): "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) - with warns_that_defaulting_to_pandas(): + # Using `force` for `NativeDataframeMode` as the warnings are raised at the API layer, + # before geting into the Query Compiler layer. 
+ with warns_that_defaulting_to_pandas(force=True): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df) diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 670eb9ff911..06ee419e6ec 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -19,7 +19,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -732,7 +732,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -772,7 +772,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -892,7 +892,8 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( - Engine.get() not in ("Ray", "Unidist", "Dask"), + Engine.get() not in ("Ray", "Unidist", "Dask") + or NativeDataframeMode.get() == "Pandas", reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index d6980cd6761..07b195bdafa 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -19,7 +19,12 @@ import pytest import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions, StorageFormat +from modin.config import ( + MinRowPartitionSize, + NativeDataframeMode, + NPartitions, + StorageFormat, +) from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.pandas.testing import assert_index_equal, assert_series_equal @@ -299,7 +304,10 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if get_current_execution() != "BaseOnPython": + if ( + get_current_execution() != "BaseOnPython" + and NativeDataframeMode.get() == "Default" + ): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -565,6 +573,10 @@ def test_astype_int64_to_astype_category_github_issue_6259(): get_current_execution() == "BaseOnPython", reason="BaseOnPython doesn't have proxy categories", ) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler doesn't have proxy categories", +) class TestCategoricalProxyDtype: """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class.""" @@ -787,6 +799,10 @@ def comparator(df1, df2): ) +@pytest.mark.skipif( + NativeDataframeMode.get() == "Pandas", + reason="NativeQueryCompiler does not contain partitions.", +) def 
test_convert_dtypes_multiple_row_partitions():
     # Column 0 should have string dtype
     modin_part1 = pd.DataFrame(["a"]).convert_dtypes()
@@ -811,7 +827,7 @@ def test_convert_dtypes_5653():
     modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]})
     modin_part2 = pd.DataFrame({"col1": [None, None, None, None]})
     modin_df = pd.concat([modin_part1, modin_part2])
-    if StorageFormat.get() == "Pandas":
+    if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default":
         assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1)
     modin_df = modin_df.convert_dtypes()
     assert len(modin_df.dtypes) == 1
diff --git a/modin/tests/pandas/dataframe/test_pickle.py b/modin/tests/pandas/dataframe/test_pickle.py
index 97c78c9cd74..5450ca4f26c 100644
--- a/modin/tests/pandas/dataframe/test_pickle.py
+++ b/modin/tests/pandas/dataframe/test_pickle.py
@@ -52,7 +52,6 @@ def test__reduce__():
     # pre-processed for the distributed engine.
     dataframe_data = ["Major League Baseball", "National Basketball Association"]
     abbr_md, abbr_pd = create_test_dfs(dataframe_data, index=["MLB", "NBA"])
-    # breakpoint()
 
     dataframe_data = {
         "name": ["Mariners", "Lakers"] * 500,
diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py
index bc478d957f9..1597b052853 100644
--- a/modin/tests/test_utils.py
+++ b/modin/tests/test_utils.py
@@ -11,6 +11,7 @@
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
 
+import contextlib
 import json
 from textwrap import dedent, indent
 from unittest.mock import Mock, patch
@@ -21,6 +22,7 @@
 
 import modin.pandas as pd
 import modin.utils
+from modin.config import NativeDataframeMode
 from modin.error_message import ErrorMessage
 from modin.tests.pandas.utils import create_test_dfs
 
@@ -248,7 +250,7 @@ def test_format_string():
     assert answer == expected
 
 
-def warns_that_defaulting_to_pandas(prefix=None, suffix=None):
+def warns_that_defaulting_to_pandas(prefix=None, suffix=None, force=False):
     """
     Assert that code warns that it's defaulting to pandas.
 
@@ -260,13 +262,21 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None):
     suffix : Optional[str]
         If specified, checks that the end of the warning message matches
         this argument after "[Dd]efaulting to pandas".
+    force : bool, default: False
+        If ``True``, return the ``pytest.recwarn.WarningsChecker`` irrespective of ``NativeDataframeMode``.
 
     Returns
    -------
-    pytest.recwarn.WarningsChecker
-        A WarningsChecker checking for a UserWarning saying that Modin is
-        defaulting to Pandas.
+    pytest.recwarn.WarningsChecker or contextlib.nullcontext
+        If Modin is not operating in ``NativeDataframeMode``, a ``WarningsChecker``
+        is returned, which will check for a ``UserWarning`` indicating that Modin
+        is defaulting to Pandas. If ``NativeDataframeMode`` is set, a
+        ``nullcontext`` is returned to suppress the defaulting-to-pandas check,
+        since defaulting to pandas is expected when the user has set ``NativeDataframeMode``.
""" + if NativeDataframeMode.get() == "Pandas" and not force: + return contextlib.nullcontext() + match = "[Dd]efaulting to pandas" if prefix: # Message may be separated by newlines From 82499151f8fecc4c858695cc75ecd2f164457c32 Mon Sep 17 00:00:00 2001 From: Kirill Suvorov Date: Mon, 26 Aug 2024 15:31:23 +0200 Subject: [PATCH 12/20] FEAT-#7337: Using dynamic partitionning in `broadcast_apply` (#7338) Signed-off-by: Kirill Suvorov --- modin/core/dataframe/algebra/groupby.py | 8 ++- .../pandas/partitioning/partition_manager.py | 59 ++++++++++++++++++- .../storage_formats/pandas/query_compiler.py | 4 +- modin/tests/pandas/test_groupby.py | 33 ++++++++++- 4 files changed, 95 insertions(+), 9 deletions(-) diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py index cc9196a422a..fec0fe3c6ac 100644 --- a/modin/core/dataframe/algebra/groupby.py +++ b/modin/core/dataframe/algebra/groupby.py @@ -655,9 +655,11 @@ def aggregate_on_dict(grp_obj, *args, **kwargs): ) native_res_part = [] if native_agg_res is None else [native_agg_res] - result = pandas.concat( - [*native_res_part, *custom_results], axis=1, copy=False - ) + parts = [*native_res_part, *custom_results] + if parts: + result = pandas.concat(parts, axis=1, copy=False) + else: + result = pandas.DataFrame(columns=result_columns) # The order is naturally preserved if there's no custom aggregations if preserve_aggregation_order and len(custom_aggs): diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 05854239206..cb207f64d4e 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -440,7 +440,7 @@ def get_partitions(index): @classmethod @wait_computations_if_benchmark_mode - def broadcast_apply(cls, axis, apply_func, left, right): + def base_broadcast_apply(cls, axis, apply_func, left, right): """ Broadcast the `right` partitions to `left` and apply `apply_func` function. @@ -504,6 +504,7 @@ def broadcast_axis_partitions( keep_partitioning=False, num_splits=None, apply_indices=None, + broadcast_all=True, enumerate_partitions=False, lengths=None, apply_func_args=None, @@ -532,6 +533,8 @@ def broadcast_axis_partitions( then the number of splits is preserved. apply_indices : list of ints, default: None Indices of `axis ^ 1` to apply function over. + broadcast_all : bool, default: True + Whether or not to pass all right axis partitions to each of the left axis partitions. enumerate_partitions : bool, default: False Whether or not to pass partition index into `apply_func`. Note that `apply_func` must be able to accept `partition_idx` kwarg. @@ -578,7 +581,6 @@ def broadcast_axis_partitions( # load-balance the data as well. 
kw = { "num_splits": num_splits, - "other_axis_partition": right_partitions, "maintain_partitioning": keep_partitioning, } if lengths: @@ -593,6 +595,9 @@ def broadcast_axis_partitions( left_partitions[i].apply( preprocessed_map_func, *(apply_func_args if apply_func_args else []), + other_axis_partition=( + right_partitions if broadcast_all else right_partitions[i] + ), **kw, **({"partition_idx": idx} if enumerate_partitions else {}), **kwargs, @@ -648,6 +653,56 @@ def base_map_partitions( ] ) + @classmethod + @wait_computations_if_benchmark_mode + def broadcast_apply( + cls, + axis, + apply_func, + left, + right, + ): + """ + Broadcast the `right` partitions to `left` and apply `apply_func` function using different approaches to achieve the best performance. + + Parameters + ---------- + axis : {0, 1} + Axis to apply and broadcast over. + apply_func : callable + Function to apply. + left : np.ndarray + NumPy array of left partitions. + right : np.ndarray + NumPy array of right partitions. + + Returns + ------- + np.ndarray + NumPy array of result partition objects. + """ + if not DynamicPartitioning.get(): + # block-wise broadcast + new_partitions = cls.base_broadcast_apply( + axis, + apply_func, + left, + right, + ) + else: + # The dynamic partitioning behavior of `broadcast_apply` differs from that of `map_partitions`, + # since the columnar approach for `broadcast_apply` results in slowdown. + # axis-wise broadcast + new_partitions = cls.broadcast_axis_partitions( + axis=axis ^ 1, + left=left, + right=right, + apply_func=apply_func, + broadcast_all=False, + keep_partitioning=True, + ) + return new_partitions + @classmethod @wait_computations_if_benchmark_mode def map_partitions( diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 3581516a638..410bd2b50d8 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -3157,9 +3157,7 @@ def dropna(self, **kwargs): lib.no_default, None, ) - # FIXME: this is a naive workaround for this problem: https://github.com/modin-project/modin/issues/5394 - # if there are too many partitions then all non-full-axis implementations start acting very badly. 
-        # The here threshold is pretty random though it works fine on simple scenarios
+        # The map-reduce approach works well for frames with few columnar partitions.
         processable_amount_of_partitions = (
             self._modin_frame.num_parts < CpuCount.get() * 32
         )
diff --git a/modin/tests/pandas/test_groupby.py b/modin/tests/pandas/test_groupby.py
index b82473c674b..36987c0d931 100644
--- a/modin/tests/pandas/test_groupby.py
+++ b/modin/tests/pandas/test_groupby.py
@@ -21,7 +21,13 @@
 import pytest
 
 import modin.pandas as pd
-from modin.config import IsRayCluster, NPartitions, RangePartitioning, StorageFormat
+from modin.config import (
+    IsRayCluster,
+    NPartitions,
+    RangePartitioning,
+    StorageFormat,
+    context,
+)
 from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy
 from modin.core.dataframe.pandas.partitioning.axis_partition import (
     PandasDataframeAxisPartition,
@@ -2431,6 +2437,31 @@ def test_multi_column_groupby_different_partitions(
     )
 
 
+def test_empty_partitions_after_groupby():
+    def func_to_apply(grp):
+        return grp.agg(
+            {
+                list(test_data_values[0].keys())[1]: "sum",
+                list(test_data_values[0].keys())[-1]: "sum",
+            }
+        )
+
+    data = test_data_values[0]
+    md_df, pd_df = create_test_dfs(data)
+    by = pd_df.columns[0]
+
+    with context(DynamicPartitioning=True):
+        md_grp, pd_grp = (
+            md_df.groupby(by),
+            pd_df.groupby(by),
+        )
+        eval_general(
+            md_grp,
+            pd_grp,
+            func_to_apply,
+        )
+
+
 @pytest.mark.parametrize(
     "by",
     [
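(A minimal usage sketch, not part of the patch: it shows how the new axis-wise
path can be selected from user code, assuming `DynamicPartitioning` and the
`context` helper behave as in the test above; the frame contents are
illustrative only.)

    import modin.pandas as pd
    from modin.config import context

    df = pd.DataFrame({"key": [1, 2, 3] * 1_000, "val": range(3_000)})
    # With DynamicPartitioning enabled, broadcast_apply routes through
    # broadcast_axis_partitions(broadcast_all=False) instead of the
    # block-wise base_broadcast_apply.
    with context(DynamicPartitioning=True):
        result = df.groupby("key").agg({"val": "sum"})

From f70176a796db92a5484ae4d3530906bdc3f5eb70 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Wed, 28 Aug 2024 03:32:28 -0700
Subject: [PATCH 13/20] FIX-#7371: Fix inserting datelike values into a DataFrame (#7372)

Signed-off-by: Jonathan Shi
---
 modin/core/dataframe/pandas/metadata/dtypes.py    |  2 +-
 modin/tests/pandas/dataframe/test_map_metadata.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py
index 1918cce16fa..9220a2dace4 100644
--- a/modin/core/dataframe/pandas/metadata/dtypes.py
+++ b/modin/core/dataframe/pandas/metadata/dtypes.py
@@ -1225,7 +1225,7 @@ def extract_dtype(value) -> DtypeObj | pandas.Series:
     """
     try:
         dtype = pandas.api.types.pandas_dtype(value)
-    except TypeError:
+    except (TypeError, ValueError):
         dtype = pandas.Series(value).dtype
     return dtype
 
diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py
index 07b195bdafa..fc9c3b76ea7 100644
--- a/modin/tests/pandas/dataframe/test_map_metadata.py
+++ b/modin/tests/pandas/dataframe/test_map_metadata.py
@@ -1837,3 +1837,18 @@ def test_constructor_from_index():
     data = pd.Index([1, 2, 3], name="pricing_date")
     modin_df, pandas_df = create_test_dfs(data)
     df_equals(modin_df, pandas_df)
+
+
+def test_insert_datelike_string_issue_7371():
+    # When a new value is inserted into a frame, we call pandas.api.types.pandas_dtype(value) to
+    # extract the dtype of an object like a pandas Series or numpy array. When a scalar value is passed,
+    # this usually raises a TypeError, so we construct a local pandas Series from the object and
+    # extract the dtype from there.
+    # When the passed value is a date-like string, pandas will instead raise a ValueError because
+    # it tries to parse it as a numpy structured dtype. After fixing GH#7371, we now catch
+    # ValueError in addition to TypeError to handle this case.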
+    modin_df = pd.DataFrame({"a": [0]})
+    modin_df["c"] = "2020-01-01"
+    pandas_df = pandas.DataFrame({"a": [0]})
+    pandas_df["c"] = "2020-01-01"
+    df_equals(modin_df, pandas_df)
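(A small illustration, not part of the patch, of the dtype-extraction fallback
the test above describes; the exception behavior follows that test comment.)

    import pandas

    # Most scalars make pandas_dtype() raise TypeError, which triggers the
    # Series-based fallback:
    try:
        dtype = pandas.api.types.pandas_dtype(5)
    except TypeError:
        dtype = pandas.Series(5).dtype  # int64
    # A date-like string is instead parsed as a (malformed) numpy structured
    # dtype spec and surfaces as ValueError, which extract_dtype() now also catches:
    try:
        dtype = pandas.api.types.pandas_dtype("2020-01-01")
    except (TypeError, ValueError):
        dtype = pandas.Series("2020-01-01").dtype  # object

From 5f4d40114ed89ef42116b272583fc2f6f700f72e Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Mon, 2 Sep 2024 01:29:24 -0700
Subject: [PATCH 14/20] FIX-#7379: Fix __imul__ performing addition instead of
 multiplication (#7380)

Signed-off-by: Jonathan Shi
---
 modin/pandas/series.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 76470ab243c..d18a0bec778 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -517,7 +517,7 @@ def __rtruediv__(self, left) -> Series:
         return self.rtruediv(left)
 
     __iadd__ = __add__
-    __imul__ = __add__
+    __imul__ = __mul__
     __ipow__ = __pow__
     __isub__ = __sub__
     __itruediv__ = __truediv__

(The user-visible effect, sketched with illustrative values, not part of the patch:)

    import modin.pandas as pd

    s = pd.Series([2, 3])
    s *= 2
    # Before this fix, in-place multiply dispatched to __add__ and gave [4, 5];
    # with __imul__ = __mul__ it correctly gives [4, 6].

From cf5d638ec7a69d2d851a7d43f23c96640eaab9dd Mon Sep 17 00:00:00 2001
From: Arun Jose <40291569+arunjose696@users.noreply.github.com>
Date: Mon, 2 Sep 2024 14:29:23 +0200
Subject: [PATCH 15/20] FEAT-#7308: Interoperability between query compilers
 (#7376)

Co-authored-by: Anatoly Myachev
Co-authored-by: Igoshev, Iaroslav
Signed-off-by: arunjose696
---
 .github/workflows/ci.yml                      |   8 +
 .../pandas/native_query_compiler.py           |   5 +-
 .../storage_formats/pandas/query_compiler.py  |   3 +-
 .../pandas/query_compiler_caster.py           | 159 +++++
 modin/pandas/dataframe.py                     |   5 +-
 modin/tests/pandas/native_df_mode/__init__.py |  12 +
 .../pandas/native_df_mode/test_binary.py      | 198 ++++++
 .../pandas/native_df_mode/test_default.py     | 338 +++++++++
 .../pandas/native_df_mode/test_indexing.py    | 668 ++++++++++++++++++
 .../tests/pandas/native_df_mode/test_iter.py  | 137 ++++
 .../pandas/native_df_mode/test_join_sort.py   | 411 +++++++++++
 .../native_df_mode/test_map_metadata.py       | 258 +++++++
 .../pandas/native_df_mode/test_pickle.py      |  73 ++
 .../pandas/native_df_mode/test_window.py      | 101 +++
 modin/tests/pandas/native_df_mode/utils.py    | 133 ++++
 15 files changed, 2502 insertions(+), 7 deletions(-)
 create mode 100644 modin/core/storage_formats/pandas/query_compiler_caster.py
 create mode 100644 modin/tests/pandas/native_df_mode/__init__.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_binary.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_default.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_indexing.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_iter.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_join_sort.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_map_metadata.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_pickle.py
 create mode 100644 modin/tests/pandas/native_df_mode/test_window.py
 create mode 100644 modin/tests/pandas/native_df_mode/utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9186500682a..8fb26225613 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -698,6 +698,14 @@ jobs:
       - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py
      - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py
       - run: python -m pytest modin/tests/pandas/dataframe/test_window.py
+      - run: python -m pytest modin/tests/pandas/native_df_mode/test_binary.py
+      - run: python -m pytest modin/tests/pandas/native_df_mode/test_default.py
+      - run: python -m pytest modin/tests/pandas/native_df_mode/test_indexing.py
+      - run: python -m pytest modin/tests/pandas/native_df_mode/test_iter.py
+      - run: python -m pytest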
modin/tests/pandas/native_df_mode/test_join_sort.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_map_metadata.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_pickle.py + - run: python -m pytest modin/tests/pandas/native_df_mode/test_window.py - uses: ./.github/actions/upload-coverage merge-coverage-artifacts: diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py index bfe331cfc6e..12f9da6ef46 100644 --- a/modin/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -24,8 +24,8 @@ import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import NativeDataframeMode from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.core.storage_formats.pandas.query_compiler_caster import QueryCompilerCaster from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, @@ -565,7 +565,7 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class NativeQueryCompiler(BaseQueryCompiler): +class NativeQueryCompiler(BaseQueryCompiler, QueryCompilerCaster): """ Query compiler for the pandas storage format. @@ -585,7 +585,6 @@ class NativeQueryCompiler(BaseQueryCompiler): _shape_hint: Optional[str] def __init__(self, pandas_frame, shape_hint: Optional[str] = None): - assert NativeDataframeMode.get() == "Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 410bd2b50d8..c7fb0bae21b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -66,6 +66,7 @@ extract_dtype, ) from modin.core.storage_formats import BaseQueryCompiler +from modin.core.storage_formats.pandas.query_compiler_caster import QueryCompilerCaster from modin.error_message import ErrorMessage from modin.logging import get_logger from modin.utils import ( @@ -253,7 +254,7 @@ def caller(df, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class PandasQueryCompiler(BaseQueryCompiler): +class PandasQueryCompiler(BaseQueryCompiler, QueryCompilerCaster): """ Query compiler for the pandas storage format. diff --git a/modin/core/storage_formats/pandas/query_compiler_caster.py b/modin/core/storage_formats/pandas/query_compiler_caster.py new file mode 100644 index 00000000000..211860a8427 --- /dev/null +++ b/modin/core/storage_formats/pandas/query_compiler_caster.py @@ -0,0 +1,159 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+
+"""
+Module contains ``QueryCompilerCaster`` class.
+
+``QueryCompilerCaster`` is used for automatically casting query compiler
+arguments to the type of the current query compiler for query compiler class functions.
+This ensures compatibility between different query compiler classes.
+"""
+
+import functools
+import inspect
+from types import FunctionType, MethodType
+from typing import Any, Dict, Tuple, TypeVar
+
+from pandas.core.indexes.frozen import FrozenList
+
+from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
+
+Fn = TypeVar("Fn", bound=Any)
+
+
+class QueryCompilerCaster:
+    """Cast all query compiler arguments of the member function to current query compiler."""
+
+    @classmethod
+    def __init_subclass__(
+        cls,
+        **kwargs: Dict,
+    ) -> None:
+        """
+        Apply type casting to all children of ``QueryCompilerCaster``.
+
+        This method is called automatically when a class inherits from
+        ``QueryCompilerCaster``. It ensures that all member functions within the
+        subclass have their arguments automatically cast to the current query
+        compiler type.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments passed to the parent's ``__init_subclass__``.
+        """
+        super().__init_subclass__(**kwargs)
+        apply_argument_cast(cls)
+
+
+def cast_nested_args_to_current_qc_type(arguments, current_qc):
+    """
+    Cast all query compiler arguments, however deeply nested, to the type of `current_qc`.
+
+    Parameters
+    ----------
+    arguments : tuple or dict
+        Positional or keyword arguments; may contain nested lists and dicts.
+    current_qc : BaseQueryCompiler
+        Query compiler whose type the arguments are cast to.
+
+    Returns
+    -------
+    tuple or dict
+        Returns args and kwargs with all query compilers cast to the type of `current_qc`.
+    """
+
+    def cast_arg_to_current_qc(arg):
+        current_qc_type = type(current_qc)
+        if isinstance(arg, BaseQueryCompiler) and not isinstance(arg, current_qc_type):
+            data_cls = current_qc._modin_frame
+            return current_qc_type.from_pandas(arg.to_pandas(), data_cls)
+        else:
+            return arg
+
+    immutable_types = (FrozenList, tuple)
+    if isinstance(arguments, immutable_types):
+        args_type = type(arguments)
+        arguments = list(arguments)
+        arguments = cast_nested_args_to_current_qc_type(arguments, current_qc)
+
+        return args_type(arguments)
+    if isinstance(arguments, list):
+        for i in range(len(arguments)):
+            if isinstance(arguments[i], (list, dict)):
+                cast_nested_args_to_current_qc_type(arguments[i], current_qc)
+            else:
+                arguments[i] = cast_arg_to_current_qc(arguments[i])
+    elif isinstance(arguments, dict):
+        for key in arguments:
+            if isinstance(arguments[key], (list, dict)):
+                cast_nested_args_to_current_qc_type(arguments[key], current_qc)
+            else:
+                arguments[key] = cast_arg_to_current_qc(arguments[key])
+    return arguments
+
+
+def apply_argument_cast(obj: Fn) -> Fn:
+    """
+    Cast all arguments that are query compilers to the current query compiler.
+
+    Parameters
+    ----------
+    obj : function
+        Function, class, classmethod, or staticmethod to decorate.
+
+    Returns
+    -------
+    function
+        Returns decorated function which does argument casting.
+    """
+    if isinstance(obj, type):
+        all_attrs = dict(inspect.getmembers(obj))
+        all_attrs.pop("__abstractmethods__")
+
+        # This is required because inspect converts class methods to member functions
+        current_class_attrs = vars(obj)
+        for key in current_class_attrs:
+            all_attrs[key] = current_class_attrs[key]
+
+        for attr_name, attr_value in all_attrs.items():
+            if isinstance(
+                attr_value, (FunctionType, MethodType, classmethod, staticmethod)
+            ):
+                wrapped = apply_argument_cast(attr_value)
+                setattr(obj, attr_name, wrapped)
+        return obj  # type: ignore [return-value]
+    elif isinstance(obj, classmethod):
+        return classmethod(apply_argument_cast(obj.__func__))  # type: ignore [return-value, arg-type]
+    elif isinstance(obj, staticmethod):
+        return staticmethod(apply_argument_cast(obj.__func__))
+
+    @functools.wraps(obj)
+    def cast_args(*args: Tuple, **kwargs: Dict) -> Any:
+        """
+        Add casting for query compiler arguments.
+
+        Parameters
+        ----------
+        *args : tuple
+            The function arguments.
+        **kwargs : dict
+            The function keyword arguments.
+
+        Returns
+        -------
+        Any
+            The result of the wrapped function, with any query compiler
+            arguments cast to the calling query compiler's type.
+        """
+        current_qc = args[0]
+        if isinstance(current_qc, BaseQueryCompiler):
+            kwargs = cast_nested_args_to_current_qc_type(kwargs, current_qc)
+            args = cast_nested_args_to_current_qc_type(args, current_qc)
+        return obj(*args, **kwargs)
+
+    return cast_args
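(A rough usage sketch of the interoperability this file enables, not part of
the patch; it assumes ``NativeDataframeMode`` can be toggled through
``modin.config.context`` as in the tests added below, and the frame names are
hypothetical. Operands backed by different query compilers can now be mixed in
one operation, with arguments cast to the receiving compiler's type.)

    import modin.pandas as pd
    from modin.config import context

    # Backed by the default, partitioned PandasQueryCompiler.
    df_partitioned = pd.DataFrame({"a": [1, 2, 3]})

    # Backed by the NativeQueryCompiler, which executes on plain pandas.
    with context(NativeDataframeMode="Pandas"):
        df_native = pd.DataFrame({"a": [10, 20, 30]})

    # The binary operation reaches PandasQueryCompiler with a
    # NativeQueryCompiler argument; apply_argument_cast converts it via the
    # to_pandas()/from_pandas() round trip before the method body runs.
    result = df_partitioned + df_native

(Note the design choice: casting goes through a pandas round trip, which is
simple and works for any compiler pair, at the cost of materializing the data.)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 3d97efb4af4..de96ea0ab26 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -2993,9 +2993,8 @@ def _create_or_update_from_compiler(
         DataFrame or None
             None if update was done, ``DataFrame`` otherwise.
         """
-        assert (
-            isinstance(new_query_compiler, type(self._query_compiler))
-            or type(new_query_compiler) in self._query_compiler.__class__.__bases__
+        assert isinstance(
+            new_query_compiler, self._query_compiler.__class__.__bases__
         ), "Invalid Query Compiler object: {}".format(type(new_query_compiler))
         if not inplace:
             return self.__constructor__(query_compiler=new_query_compiler)
diff --git a/modin/tests/pandas/native_df_mode/__init__.py b/modin/tests/pandas/native_df_mode/__init__.py
new file mode 100644
index 00000000000..cae6413e559
--- /dev/null
+++ b/modin/tests/pandas/native_df_mode/__init__.py
@@ -0,0 +1,12 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
diff --git a/modin/tests/pandas/native_df_mode/test_binary.py b/modin/tests/pandas/native_df_mode/test_binary.py
new file mode 100644
index 00000000000..82c837b6416
--- /dev/null
+++ b/modin/tests/pandas/native_df_mode/test_binary.py
@@ -0,0 +1,198 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership.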
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import matplotlib +import pytest + +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. +pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +@pytest.mark.parametrize( + "other", + [ + lambda df, axis: 4, + lambda df, axis: df.iloc[0] if axis == "columns" else list(df[df.columns[0]]), + lambda df, axis: { + label: idx + 1 + for idx, label in enumerate(df.axes[0 if axis == "rows" else 1]) + }, + lambda df, axis: { + label if idx % 2 else f"random_key{idx}": idx + 1 + for idx, label in enumerate(df.axes[0 if axis == "rows" else 1][::-1]) + }, + ], + ids=[ + "scalar", + "series_or_list", + "dictionary_keys_equal_columns", + "dictionary_keys_unequal_columns", + ], +) +@pytest.mark.parametrize("axis", ["rows", "columns"]) +@pytest.mark.parametrize( + "op", + [ + *("add", "radd", "sub", "rsub", "mod", "rmod", "pow", "rpow"), + *("truediv", "rtruediv", "mul", "rmul", "floordiv", "rfloordiv"), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +@pytest.mark.parametrize("backend", [None, "pyarrow"]) +def test_math_functions(other, axis, op, backend, df_mode_pair): + data = test_data["float_nan_data"] + if (op == "floordiv" or op == "rfloordiv") and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behavior") + + if op == "rmod" and axis == "rows": + # lambda == "series_or_list" + pytest.xfail(reason="different behavior") + + if op in ("mod", "rmod") and backend == "pyarrow": + pytest.skip(reason="These functions are not implemented in pandas itself") + + eval_general_interop( + data, + backend, + lambda df1, df2: getattr(df1, op)(other(df2, axis), axis=axis), + df_mode_pair, + ) + + +@pytest.mark.parametrize("other", [lambda df: 2, lambda df: df]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___divmod__(other, df_mode_pair): + data = test_data["float_nan_data"] + eval_general_interop( + data, None, lambda df1, df2: divmod(df1, other(df2)), df_mode_pair + ) + + +@pytest.mark.parametrize("other", ["as_left", 4]) +@pytest.mark.parametrize("op", ["eq", "ge", "gt", "le", "lt", "ne"]) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def 
test_comparison(data, op, other, request, df_mode_pair): + def operation(df1, df2): + return getattr(df1, op)(df2 if other == "as_left" else other) + + expected_exception = None + if "int_data" in request.node.callspec.id and other == "a": + pytest.xfail(reason="https://github.com/modin-project/modin/issues/7019") + elif "float_nan_data" in request.node.callspec.id and other == "a": + expected_exception = TypeError( + "Invalid comparison between dtype=float64 and str" + ) + eval_general_interop( + data, + None, + operation, + df_mode_pair, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "frame1_data,frame2_data,expected_pandas_equals", + [ + pytest.param({}, {}, True, id="two_empty_dataframes"), + pytest.param([[1]], [[0]], False, id="single_unequal_values"), + pytest.param([[None]], [[None]], True, id="single_none_values"), + pytest.param( + [[1, 2], [3, 4]], + [[1, 2], [3, 4]], + True, + id="equal_two_by_two_dataframes", + ), + pytest.param( + [[1, 2], [3, 4]], + [[5, 2], [3, 4]], + False, + id="unequal_two_by_two_dataframes", + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_equals(frame1_data, frame2_data, expected_pandas_equals, df_mode_pair): + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + frame1_data, df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + frame2_data, df_mode=df_mode_pair[1] + ) + + pandas_equals = pandas_df1.equals(pandas_df2) + assert pandas_equals == expected_pandas_equals, ( + "Test expected pandas to say the dataframes were" + + f"{'' if expected_pandas_equals else ' not'} equal, but they were" + + f"{' not' if expected_pandas_equals else ''} equal." + ) + + assert modin_df1.equals(modin_df2) == pandas_equals + assert modin_df1.equals(pandas_df2) == pandas_equals + + +@pytest.mark.parametrize("empty_operand", ["right", "left", "both"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_empty_df(empty_operand, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + [0, 1, 2, 0, 1, 2], df_mode=df_mode_pair[0] + ) + modin_df_empty, pandas_df_empty = create_test_df_in_defined_mode( + df_mode=df_mode_pair[1] + ) + + if empty_operand == "right": + modin_res = modin_df + modin_df_empty + pandas_res = pandas_df + pandas_df_empty + elif empty_operand == "left": + modin_res = modin_df_empty + modin_df + pandas_res = pandas_df_empty + pandas_df + else: + modin_res = modin_df_empty + modin_df_empty + pandas_res = pandas_df_empty + pandas_df_empty + + df_equals(modin_res, pandas_res) diff --git a/modin/tests/pandas/native_df_mode/test_default.py b/modin/tests/pandas/native_df_mode/test_default.py new file mode 100644 index 00000000000..03d6d372fd4 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_default.py @@ -0,0 +1,338 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest +from numpy.testing import assert_array_equal + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.io import to_pandas +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + test_data, + test_data_diff_dtype, + test_data_keys, + test_data_large_categorical_dataframe, + test_data_values, +) +from modin.tests.test_utils import warns_that_defaulting_to_pandas + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. +pytestmark = [ + pytest.mark.filterwarnings(default_to_pandas_ignore_string), + # IGNORE FUTUREWARNINGS MARKS TO CLEANUP OUTPUT + pytest.mark.filterwarnings( + "ignore:.*bool is now deprecated and will be removed:FutureWarning" + ), + pytest.mark.filterwarnings( + "ignore:first is deprecated and will be removed:FutureWarning" + ), + pytest.mark.filterwarnings( + "ignore:last is deprecated and will be removed:FutureWarning" + ), +] + + +@pytest.mark.parametrize( + "op, make_args", + [ + ("align", lambda df: {"other": df}), + ("corrwith", lambda df: {"other": df}), + ("ewm", lambda df: {"com": 0.5}), + ("from_dict", lambda df: {"data": None}), + ("from_records", lambda df: {"data": to_pandas(df)}), + ("hist", lambda df: {"column": "int_col"}), + ("interpolate", None), + ("mask", lambda df: {"cond": df != 0}), + ("pct_change", None), + ("to_xarray", None), + ("flags", None), + ("set_flags", lambda df: {"allows_duplicate_labels": False}), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_ops_defaulting_to_pandas(op, make_args, df_mode_pair): + modin_df1, _ = create_test_df_in_defined_mode( + test_data_diff_dtype, + post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), + df_mode=df_mode_pair[0], + ) + modin_df2, _ = create_test_df_in_defined_mode( + test_data_diff_dtype, + post_fn=lambda df: df.drop(["str_col", "bool_col"], axis=1), + df_mode=df_mode_pair[1], + ) + with warns_that_defaulting_to_pandas(): + operation = getattr(modin_df1, op) + if make_args is not None: + operation(**make_args(modin_df2)) + else: + try: + operation() + # `except` for non callable attributes + except TypeError: + pass + + +@pytest.mark.parametrize( + "data", + test_data_values + [test_data_large_categorical_dataframe], + ids=test_data_keys + ["categorical_ints"], +) +def test_to_numpy(data): + modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) + assert_array_equal(modin_df.values, pandas_df.values) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_asfreq(df_mode_pair): + index = 
pd.date_range("1/1/2000", periods=4, freq="min")
+    series, _ = create_test_series_in_defined_mode(
+        [0.0, None, 2.0, 3.0], index=index, df_mode=df_mode_pair[0]
+    )
+    df, _ = create_test_df_in_defined_mode({"s": series}, df_mode=df_mode_pair[1])
+    with warns_that_defaulting_to_pandas():
+        # We are only testing that this defaults to pandas, so we will just check for
+        # the warning
+        df.asfreq(freq="30S")
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_assign(df_mode_pair):
+    data = test_data_values[0]
+
+    def assign_one_column(df1, df2):
+        df1.assign(new_column=pd.Series(df2.iloc[:, 0]))
+
+    eval_general_interop(data, None, assign_one_column, df_mode_pair)
+
+    def assign_multiple_columns(df1, df2):
+        df1.assign(
+            new_column=pd.Series(df2.iloc[:, 0]), new_column2=pd.Series(df2.iloc[:, 1])
+        )
+
+    eval_general_interop(data, None, assign_multiple_columns, df_mode_pair)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_combine_first(df_mode_pair):
+    data1 = {"A": [None, 0], "B": [None, 4]}
+    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
+        data1, df_mode=df_mode_pair[0]
+    )
+    data2 = {"A": [1, 1], "B": [3, 3]}
+    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
+        data2, df_mode=df_mode_pair[1]
+    )
+
+    df_equals(
+        modin_df1.combine_first(modin_df2),
+        pandas_df1.combine_first(pandas_df2),
+        # https://github.com/modin-project/modin/issues/5959
+        check_dtypes=False,
+    )
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_dot(data, df_mode_pair):
+
+    modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0])
+    col_len = len(modin_df.columns)
+
+    # Test series input
+    modin_series, pandas_series = create_test_series_in_defined_mode(
+        np.arange(col_len),
+        index=pandas_df.columns,
+        df_mode=df_mode_pair[1],
+    )
+    modin_result = modin_df.dot(modin_series)
+    pandas_result = pandas_df.dot(pandas_series)
+    df_equals(modin_result, pandas_result)
+
+    def dot_func(df1, df2):
+        return df1.dot(df2.T)
+
+    # Test dataframe input
+    eval_general_interop(data, None, dot_func, df_mode_pair)
+
+    # Test when input series index doesn't line up with columns
+    with pytest.raises(ValueError):
+        modin_series_without_index, _ = create_test_series_in_defined_mode(
+            np.arange(col_len), df_mode=df_mode_pair[1]
+        )
+        modin_df.dot(modin_series_without_index)
+
+    # Test case when left dataframe has size (n x 1)
+    # and right dataframe has size (1 x n)
+    eval_general_interop(pandas_series, None, dot_func, df_mode_pair)
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_matmul(data, df_mode_pair):
+    modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0])
+    col_len = len(modin_df.columns)
+
+    # Test list input
+    arr = np.arange(col_len)
+    modin_result = modin_df @ arr
+    pandas_result = pandas_df @ arr
+    df_equals(modin_result, pandas_result)
+
+    # Test bad dimensions
+    with pytest.raises(ValueError):
+        modin_df @ np.arange(col_len + 10)
+
+    # Test series input
+    modin_series, pandas_series = create_test_series_in_defined_mode(
+        
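+        # (illustrative note) `@` dispatches through DataFrame.__matmul__,
+        # which in pandas simply calls DataFrame.dot, so this 1-D operand
+        # must align with `pandas_df.columns`, exactly like the series input
+        # in test_dot above.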
np.arange(col_len), + index=pandas_df.columns, + df_mode=df_mode_pair[1], + ) + modin_result = modin_df @ modin_series + pandas_result = pandas_df @ pandas_series + df_equals(modin_result, pandas_result) + + # Test dataframe input + def matmul_func(df1, df2): + return df1 @ df2.T + + # Test dataframe input + eval_general_interop(data, None, matmul_func, df_mode_pair) + + # Test when input series index doesn't line up with columns + with pytest.raises(ValueError): + modin_series_without_index, _ = create_test_series_in_defined_mode( + np.arange(col_len), df_mode=df_mode_pair[1] + ) + modin_df @ modin_series_without_index + + +@pytest.mark.parametrize("data", [test_data["int_data"]], ids=["int_data"]) +@pytest.mark.parametrize( + "index", + [ + pytest.param(lambda _, df: df.columns[0], id="single_index_col"), + pytest.param( + lambda _, df: [*df.columns[0:2], *df.columns[-7:-4]], + id="multiple_index_cols", + ), + pytest.param(None, id="default_index"), + ], +) +@pytest.mark.parametrize( + "columns", + [ + pytest.param(lambda _, df: df.columns[len(df.columns) // 2], id="single_col"), + pytest.param( + lambda _, df: [ + *df.columns[(len(df.columns) // 2) : (len(df.columns) // 2 + 4)], + df.columns[-7], + ], + id="multiple_cols", + ), + pytest.param(None, id="default_columns"), + ], +) +@pytest.mark.parametrize( + "values", + [ + pytest.param(lambda _, df: df.columns[-1], id="single_value_col"), + pytest.param(lambda _, df: df.columns[-4:-1], id="multiple_value_cols"), + ], +) +@pytest.mark.parametrize( + "aggfunc", + [ + pytest.param(lambda df, _: np.mean(df), id="callable_tree_reduce_func"), + pytest.param("mean", id="tree_reduce_func"), + pytest.param("nunique", id="full_axis_func"), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_pivot_table_data(data, index, columns, values, aggfunc, request, df_mode_pair): + if ( + "callable_tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "callable_tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "tree_reduce_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "tree_reduce_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "full_axis_func-single_value_col-multiple_cols-multiple_index_cols" + in request.node.callspec.id + or "full_axis_func-multiple_value_cols-multiple_cols-multiple_index_cols" + in request.node.callspec.id + ): + pytest.xfail(reason="https://github.com/modin-project/modin/issues/7011") + + expected_exception = None + if "default_columns-default_index" in request.node.callspec.id: + expected_exception = ValueError("No group keys passed!") + elif ( + "callable_tree_reduce_func" in request.node.callspec.id + and "int_data" in request.node.callspec.id + ): + expected_exception = TypeError("'numpy.float64' object is not callable") + + eval_general_interop( + data, + None, + operation=lambda df, _, *args, **kwargs: df.pivot_table( + *args, **kwargs + ).sort_index(axis=int(index is not None)), + df_mode_pair=df_mode_pair, + index=index, + columns=columns, + values=values, + aggfunc=aggfunc, + expected_exception=expected_exception, + ) diff --git a/modin/tests/pandas/native_df_mode/test_indexing.py b/modin/tests/pandas/native_df_mode/test_indexing.py new file mode 100644 index 00000000000..b434026394a --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_indexing.py @@ -0,0 
+1,668 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + RAND_HIGH, + RAND_LOW, + default_to_pandas_ignore_string, + df_equals, + eval_general, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. +# TODO(https://github.com/modin-project/modin/issues/3655): catch all instances +# of defaulting to pandas. +pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=None): + if loc is not None: + col = pd_df.columns[loc] + + value_getter = value if callable(value) else (lambda *args, **kwargs: value) + + eval_general( + md_df, + pd_df, + lambda df: df.__setitem__(col, value_getter(df)), + __inplace__=True, + expected_exception=expected_exception, + ) + df_mode_pair_list = list(product(NativeDataframeMode.choices, repeat=2)) + for df_mode_pair in df_mode_pair_list: + eval_general_interop( + pd_df, + None, + lambda df1, df2: df1.__setitem__(col, value_getter(df2)), + df_mode_pair, + __inplace__=True, + expected_exception=expected_exception, + ) + + +def eval_loc(md_df, pd_df, value, key): + if isinstance(value, tuple): + assert len(value) == 2 + # case when value for pandas different + md_value, pd_value = value + else: + md_value, pd_value = value, value + + eval_general( + md_df, + pd_df, + lambda df: df.loc.__setitem__( + key, pd_value if isinstance(df, pandas.DataFrame) else md_value + ), + __inplace__=True, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "key_func", + [ + # test for the case from https://github.com/modin-project/modin/issues/4308 + lambda df: "non_existing_column", + lambda df: df.columns[0], + lambda df: df.index, + lambda df: [df.index, df.columns[0]], + lambda df: ( + pandas.Series(list(range(len(df.index)))) + if isinstance(df, pandas.DataFrame) + else pd.Series(list(range(len(df)))) + ), + ], + ids=[ + "non_existing_column", + "first_column_name", + "original_index", + "list_of_index_and_first_column_name", + "series_of_integers", + ], +) +@pytest.mark.parametrize( + "drop_kwargs", + [{"drop": True}, {"drop": False}, {}], + 
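+    # (note) `drop_kwargs` is forwarded to DataFrame.set_index below:
+    # drop=True removes the key column(s) from the frame, and the empty dict
+    # exercises the pandas default, which is also drop=True.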
ids=["drop_True", "drop_False", "no_drop_param"], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_set_index(data, key_func, drop_kwargs, request, df_mode_pair): + if ( + "list_of_index_and_first_column_name" in request.node.name + and "drop_False" in request.node.name + ): + pytest.xfail( + reason="KeyError: https://github.com/modin-project/modin/issues/5636" + ) + expected_exception = None + if "non_existing_column" in request.node.callspec.id: + expected_exception = KeyError( + "None of ['non_existing_column'] are in the columns" + ) + + eval_general_interop( + data, + None, + lambda df1, df2: df1.set_index(key_func(df2), **drop_kwargs), + expected_exception=expected_exception, + df_mode_pair=df_mode_pair, + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc(data, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + indices = [i % 3 == 0 for i in range(len(modin_df.index))] + columns = [i % 5 == 0 for i in range(len(modin_df.columns))] + + # Key is a Modin or pandas series of booleans + series1, _ = create_test_series_in_defined_mode(indices, df_mode=df_mode_pair[0]) + series2, _ = create_test_series_in_defined_mode( + columns, index=modin_df.columns, df_mode=df_mode_pair[0] + ) + df_equals( + modin_df.loc[series1, series2], + pandas_df.loc[ + pandas.Series(indices), pandas.Series(columns, index=modin_df.columns) + ], + ) + + +@pytest.mark.parametrize("left, right", [(2, 1), (6, 1), (lambda df: 70, 1), (90, 70)]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_loc_insert_row(left, right, df_mode_pair): + # This test case comes from + # https://github.com/modin-project/modin/issues/3764 + data = [[1, 2, 3], [4, 5, 6]] + + def _test_loc_rows(df1, df2): + df1.loc[left] = df2.loc[right] + return df1 + + expected_exception = None + if right == 70: + pytest.xfail(reason="https://github.com/modin-project/modin/issues/7024") + + eval_general_interop( + data, + None, + _test_loc_rows, + expected_exception=expected_exception, + df_mode_pair=df_mode_pair, + ) + + +@pytest.fixture(params=list(product(NativeDataframeMode.choices, repeat=2))) +def loc_iter_dfs_interop(request): + df_mode_pair = request.param + columns = ["col1", "col2", "col3"] + index = ["row1", "row2", "row3"] + md_df1, pd_df1 = create_test_df_in_defined_mode( + {col: ([idx] * len(index)) for idx, col in enumerate(columns)}, + columns=columns, + index=index, + df_mode=df_mode_pair[0], + ) + md_df2, pd_df2 = create_test_df_in_defined_mode( + {col: ([idx] * len(index)) for idx, col in enumerate(columns)}, + columns=columns, + index=index, + df_mode=df_mode_pair[1], + ) + return md_df1, pd_df1, md_df2, pd_df2 + + +@pytest.mark.parametrize("reverse_order", [False, True]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_loc_iter_assignment(loc_iter_dfs_interop, reverse_order, axis): + if reverse_order and axis: + pytest.xfail( + "Due to internal sorting of lookup values assignment order is lost, see GH-#2552" + ) + + md_df1, pd_df1, md_df2, pd_df2 = loc_iter_dfs_interop + + select = [slice(None), slice(None)] + select[axis] = sorted(pd_df1.axes[axis][:-1], reverse=reverse_order) + select = tuple(select) + + pd_df1.loc[select] = pd_df1.loc[select] + pd_df2.loc[select] + md_df1.loc[select] = md_df1.loc[select] + 
md_df2.loc[select]
+    df_equals(md_df1, pd_df1)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_loc_series(df_mode_pair):
+    md_df1, pd_df1 = create_test_df_in_defined_mode(
+        {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[0]
+    )
+    md_df2, pd_df2 = create_test_df_in_defined_mode(
+        {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[1]
+    )
+
+    pd_df1.loc[pd_df2["a"] > 1, "b"] = np.log(pd_df1["b"])
+    md_df1.loc[md_df2["a"] > 1, "b"] = np.log(md_df1["b"])
+
+    df_equals(pd_df1, md_df1)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_reindex_like(df_mode_pair):
+    o_data = [
+        [24.3, 75.7, "high"],
+        [31, 87.8, "high"],
+        [22, 71.6, "medium"],
+        [35, 95, "medium"],
+    ]
+    o_columns = ["temp_celsius", "temp_fahrenheit", "windspeed"]
+    o_index = pd.date_range(start="2014-02-12", end="2014-02-15", freq="D")
+    new_data = [[28, "low"], [30, "low"], [35.1, "medium"]]
+    new_columns = ["temp_celsius", "windspeed"]
+    new_index = pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"])
+    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
+        o_data,
+        columns=o_columns,
+        index=o_index,
+        df_mode=df_mode_pair[0],
+    )
+    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
+        new_data,
+        columns=new_columns,
+        index=new_index,
+        df_mode=df_mode_pair[1],
+    )
+    modin_result = modin_df2.reindex_like(modin_df1)
+    pandas_result = pandas_df2.reindex_like(pandas_df1)
+    df_equals(modin_result, pandas_result)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_reindex_multiindex(df_mode_pair):
+    data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6)
+    index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])
+    pandas_midx = pandas.MultiIndex.from_product(
+        [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Currency"]
+    )
+    modin_df1, pandas_df1 = create_test_df_in_defined_mode(
+        data=data1, index=index, columns=index, df_mode=df_mode_pair[0]
+    )
+    modin_df2, pandas_df2 = create_test_df_in_defined_mode(
+        data=data2, index=pandas_midx, df_mode=df_mode_pair[1]
+    )
+
+    modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"]
+    md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index])
+    pd_midx = pandas.MultiIndex.from_product(
+        [pandas_df2.index.levels[0], pandas_df1.index]
+    )
+    # reindex without axis, index, or columns
+    modin_result = modin_df1.reindex(md_midx, fill_value=0)
+    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0)
+    df_equals(modin_result, pandas_result)
+    # reindex with only axis
+    modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0)
+    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0)
+    df_equals(modin_result, pandas_result)
+    # reindex with axis and level
+    modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0)
+    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0)
+    df_equals(modin_result, pandas_result)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_getitem_empty_mask(df_mode_pair):
+    # modin-project/modin#517
+    modin_frames = []
+    pandas_frames = []
+    data1 = np.random.randint(0, 100, size=(100, 4))
+    mdf1, pdf1 = create_test_df_in_defined_mode(
+        data1, columns=list("ABCD"), df_mode=df_mode_pair[0]
+    )
+
+    modin_frames.append(mdf1)
+    pandas_frames.append(pdf1)
+
+    data2 = np.random.randint(0, 
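+        # (note) a second, independently generated block; concatenating
+        # several frames below presumably spreads the data across multiple
+        # partitions before the all-False mask is applied.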
100, size=(100, 4)) + mdf2, pdf2 = create_test_df_in_defined_mode( + data2, columns=list("ABCD"), df_mode=df_mode_pair[1] + ) + modin_frames.append(mdf2) + pandas_frames.append(pdf2) + + data3 = np.random.randint(0, 100, size=(100, 4)) + mdf3, pdf3 = create_test_df_in_defined_mode( + data3, columns=list("ABCD"), df_mode=df_mode_pair[0] + ) + modin_frames.append(mdf3) + pandas_frames.append(pdf3) + + modin_data = pd.concat(modin_frames) + pandas_data = pandas.concat(pandas_frames) + df_equals( + modin_data[[False for _ in modin_data.index]], + pandas_data[[False for _ in modin_data.index]], + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___setitem__mask(df_mode_pair): + # DataFrame mask: + data = test_data["int_data"] + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[0] + ) + + mean = int((RAND_HIGH + RAND_LOW) / 2) + pandas_df1[pandas_df2 > mean] = -50 + modin_df1[modin_df2 > mean] = -50 + + df_equals(modin_df1, pandas_df1) + + +@pytest.mark.parametrize( + "data", + [ + {}, + {"id": [], "max_speed": [], "health": []}, + {"id": [1], "max_speed": [2], "health": [3]}, + {"id": [4, 40, 400], "max_speed": [111, 222, 333], "health": [33, 22, 11]}, + ], + ids=["empty_frame", "empty_cols", "1_length_cols", "2_length_cols"], +) +@pytest.mark.parametrize( + "value", + [[11, 22], [11, 22, 33]], + ids=["2_length_val", "3_length_val"], +) +@pytest.mark.parametrize("convert_to_series", [False, True]) +@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + def applyier(df): + if convert_to_series: + converted_value = ( + pandas.Series(value) + if isinstance(df, pandas.DataFrame) + else create_test_series_in_defined_mode(value, df_mode=df_mode_pair[1])[ + 1 + ] + ) + else: + converted_value = value + df[new_col_id] = converted_value + return df + + expected_exception = None + if not convert_to_series: + values_length = len(value) + index_length = len(pandas_df.index) + expected_exception = ValueError( + f"Length of values ({values_length}) does not match length of index ({index_length})" + ) + + eval_general( + modin_df, + pandas_df, + applyier, + # https://github.com/modin-project/modin/issues/5961 + comparator_kwargs={ + "check_dtypes": not (len(pandas_df) == 0 and len(pandas_df.columns) != 0) + }, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_on_empty_df_4407(df_mode_pair): + data = {} + index = pd.date_range(end="1/1/2018", periods=0, freq="D") + column = pd.date_range(end="1/1/2018", periods=1, freq="h")[0] + modin_df, pandas_df = create_test_df_in_defined_mode( + data, columns=index, df_mode=df_mode_pair[0] + ) + modin_ser, pandas_ser = create_test_series_in_defined_mode( + [1], df_mode=df_mode_pair[1] + ) + modin_df[column] = modin_ser + pandas_df[column] = pandas_ser + + df_equals(modin_df, pandas_df) + assert modin_df.columns.freq == pandas_df.columns.freq + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def 
test_setitem_2d_insertion(df_mode_pair): + def build_value_picker(modin_value, pandas_value): + """Build a function that returns either Modin or pandas DataFrame depending on the passed frame.""" + return lambda source_df, *args, **kwargs: ( + modin_value + if isinstance(source_df, (pd.DataFrame, pd.Series)) + else pandas_value + ) + + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[0] + ) + + # Easy case - key and value.columns are equal + modin_value, pandas_value = create_test_df_in_defined_mode( + { + "new_value1": np.arange(len(modin_df)), + "new_value2": np.arange(len(modin_df)), + }, + df_mode=df_mode_pair[1], + ) + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["new_value1", "new_value2"], + ) + + # Key and value.columns have equal values but in different order + new_columns = ["new_value3", "new_value4"] + modin_value.columns, pandas_value.columns = new_columns, new_columns + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["new_value4", "new_value3"], + ) + + # Key and value.columns have different values + new_columns = ["new_value5", "new_value6"] + modin_value.columns, pandas_value.columns = new_columns, new_columns + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value, pandas_value), + col=["__new_value5", "__new_value6"], + ) + + # Key and value.columns have different lengths, testing that both raise the same exception + eval_setitem( + modin_df, + pandas_df, + build_value_picker(modin_value.iloc[:, [0]], pandas_value.iloc[:, [0]]), + col=["new_value7", "new_value8"], + expected_exception=ValueError("Columns must be same length as key"), + ) + + +@pytest.mark.parametrize("does_value_have_different_columns", [True, False]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_setitem_2d_update(does_value_have_different_columns, df_mode_pair): + def test(dfs, iloc): + """Update columns on the given numeric indices.""" + df1, df2 = dfs + cols1 = df1.columns[iloc].tolist() + cols2 = df2.columns[iloc].tolist() + df1[cols1] = df2[cols2] + return df1 + + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data["int_data"], df_mode=df_mode_pair[1] + ) + modin_df2 *= 10 + pandas_df2 *= 10 + + if does_value_have_different_columns: + new_columns = [f"{col}_new" for col in modin_df.columns] + modin_df2.columns = new_columns + pandas_df2.columns = new_columns + + modin_dfs = (modin_df, modin_df2) + pandas_dfs = (pandas_df, pandas_df2) + + eval_general(modin_dfs, pandas_dfs, test, iloc=[0, 1, 2]) + eval_general(modin_dfs, pandas_dfs, test, iloc=[0, -1]) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(1, None) + ) # (start=1, stop=None) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(None, -2) + ) # (start=None, stop=-2) + eval_general( + modin_dfs, + pandas_dfs, + test, + iloc=[0, 1, 5, 6, 9, 10, -2, -1], + ) + eval_general( + modin_dfs, + pandas_dfs, + test, + iloc=[5, 4, 0, 10, 1, -1], + ) + eval_general( + modin_dfs, pandas_dfs, test, iloc=slice(None, None, 2) + ) # (start=None, stop=None, step=2) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___setitem__single_item_in_series(df_mode_pair): + # Test assigning a single item in a Series for issue + # 
https://github.com/modin-project/modin/issues/3860
+    modin_series1, pandas_series1 = create_test_series_in_defined_mode(
+        99, df_mode=df_mode_pair[0]
+    )
+    modin_series2, pandas_series2 = create_test_series_in_defined_mode(
+        100, df_mode=df_mode_pair[1]
+    )
+    modin_series1[:1] = modin_series2
+    pandas_series1[:1] = pandas_series2
+    df_equals(modin_series1, pandas_series1)
+
+
+@pytest.mark.parametrize(
+    "value",
+    [
+        1,
+        np.int32(1),
+        1.0,
+        "str val",
+        pandas.Timestamp("1/4/2018"),
+        np.datetime64(0, "ms"),
+        True,
+    ],
+)
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_loc_boolean_assignment_scalar_dtypes(value, df_mode_pair):
+    modin_df, pandas_df = create_test_df_in_defined_mode(
+        {
+            "a": [1, 2, 3],
+            "b": [3.0, 5.0, 6.0],
+            "c": ["a", "b", "c"],
+            "d": [1.0, "c", 2.0],
+            "e": pandas.to_datetime(["1/1/2018", "1/2/2018", "1/3/2018"]),
+            "f": [True, False, True],
+        },
+        df_mode=df_mode_pair[1],
+    )
+    modin_idx, pandas_idx = create_test_series_in_defined_mode(
+        [False, True, True], df_mode=df_mode_pair[1]
+    )
+
+    modin_df.loc[modin_idx] = value
+    pandas_df.loc[pandas_idx] = value
+    df_equals(modin_df, pandas_df)
+
+
+# This is a very subtle bug that comes from:
+# https://github.com/modin-project/modin/issues/4945
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_lazy_eval_index(df_mode_pair):
+    data = {"col0": [0, 1]}
+
+    def func(df1, df2):
+        df_copy = df1[df2["col0"] < 6].copy()
+        # The problem here is that the index is not copied over, so it needs
+        # to get recomputed at some point. Our implementation of __setitem__
+        # requires us to build a mask and insert the value from the right-hand
+        # side into the new DataFrame. However, it's possible that we won't
+        # have any new partitions, so we will end up computing an empty index.
+        df_copy["col0"] = df_copy["col0"].apply(lambda x: x + 1)
+        return df_copy
+
+    eval_general_interop(data, None, func, df_mode_pair=df_mode_pair)
+
+
+@pytest.mark.parametrize(
+    "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2))
+)
+def test_index_of_empty_frame(df_mode_pair):
+    # Test on an empty frame produced by Modin's logic
+    data = test_data_values[0]
+    md_df1, pd_df1 = create_test_df_in_defined_mode(
+        data,
+        index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"),
+        df_mode=df_mode_pair[0],
+    )
+    md_df2, pd_df2 = create_test_df_in_defined_mode(
+        data,
+        index=pandas.RangeIndex(len(next(iter(data.values()))), name="index name"),
+        df_mode=df_mode_pair[1],
+    )
+
+    md_res = md_df1.query(f"{md_df2.columns[0]} > {RAND_HIGH}")
+    pd_res = pd_df1.query(f"{pd_df2.columns[0]} > {RAND_HIGH}")
+
+    assert md_res.empty and pd_res.empty
+    df_equals(md_res.index, pd_res.index)
diff --git a/modin/tests/pandas/native_df_mode/test_iter.py b/modin/tests/pandas/native_df_mode/test_iter.py
new file mode 100644
index 00000000000..a2e176d4372
--- /dev/null
+++ b/modin/tests/pandas/native_df_mode/test_iter.py
@@ -0,0 +1,137 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import warnings +from itertools import product + +import matplotlib +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.utils import SET_DATAFRAME_ATTRIBUTE_WARNING +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, +) +from modin.tests.pandas.utils import df_equals, eval_general + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test___setattr__mutating_column(df_mode_pair): + # Use case from issue #4577 + modin_df, pandas_df = create_test_df_in_defined_mode( + [[1]], columns=["col0"], df_mode=df_mode_pair[0] + ) + # Replacing a column with a list should mutate the column in place. + pandas_df.col0 = [3] + modin_df.col0 = [3] + modin_ser, pandas_ser = create_test_series_in_defined_mode( + [3], df_mode=df_mode_pair[1] + ) + df_equals(modin_df, pandas_df) + # Check that the col0 attribute reflects the value update. + df_equals(modin_df.col0, pandas_df.col0) + + pandas_df.col0 = pandas_ser + modin_df.col0 = modin_ser + + # Check that the col0 attribute reflects this update + df_equals(modin_df, pandas_df) + + pandas_df.loc[0, "col0"] = 4 + modin_df.loc[0, "col0"] = 4 + + # Check that the col0 attribute reflects update via loc + df_equals(modin_df, pandas_df) + assert modin_df.col0.equals(modin_df["col0"]) + + # Check that attempting to add a new col via attributes raises warning + # and adds the provided list as a new attribute and not a column. + with pytest.warns( + UserWarning, + match=SET_DATAFRAME_ATTRIBUTE_WARNING, + ): + modin_df.col1 = [4] + + with warnings.catch_warnings(): + warnings.filterwarnings( + action="error", + message=SET_DATAFRAME_ATTRIBUTE_WARNING, + ) + modin_df.col1 = [5] + modin_df.new_attr = 6 + modin_df.col0 = 7 + + assert "new_attr" in dir( + modin_df + ), "Modin attribute was not correctly added to the df." + assert ( + "new_attr" not in modin_df + ), "New attribute was not correctly added to columns." + assert modin_df.new_attr == 6, "Modin attribute value was set incorrectly." + assert isinstance( + modin_df.col0, pd.Series + ), "Scalar was not broadcasted properly to an existing column." 
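+
+# A minimal sketch of the pandas-level rules the test above mirrors
+# (illustrative only, not executed):
+#
+#     df = pandas.DataFrame({"col0": [1]})
+#     df.col0 = [3]        # existing column: assignment updates the column
+#     df.col1 = [4]        # new name + list-like value: pandas warns and sets
+#                          # a plain attribute, not a column
+#     df.new_attr = 6      # new name + scalar: silently becomes an attribute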
+ + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_isin_with_modin_objects(df_mode_pair): + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, df_mode=df_mode_pair[0] + ) + modin_series, pandas_series = create_test_series_in_defined_mode( + [1, 4, 5, 6], df_mode=df_mode_pair[1] + ) + + eval_general( + (modin_df1, modin_series), + (pandas_df1, pandas_series), + lambda srs: srs[0].isin(srs[1]), + ) + + modin_df2 = modin_series.to_frame("a") + pandas_df2 = pandas_series.to_frame("a") + + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda srs: srs[0].isin(srs[1]), + ) + + # Check case when indices are not matching + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + {"a": [1, 2], "b": [3, 4]}, + index=[10, 11], + df_mode=df_mode_pair[0], + ) + + eval_general( + (modin_df1, modin_series), + (pandas_df1, pandas_series), + lambda srs: srs[0].isin(srs[1]), + ) + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda srs: srs[0].isin(srs[1]), + ) diff --git a/modin/tests/pandas/native_df_mode/test_join_sort.py b/modin/tests/pandas/native_df_mode/test_join_sort.py new file mode 100644 index 00000000000..62565dde382 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_join_sort.py @@ -0,0 +1,411 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.pandas.io import to_pandas +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, + eval_general_interop, +) +from modin.tests.pandas.utils import ( + default_to_pandas_ignore_string, + df_equals, + eval_general, + random_state, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
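+# (`default_to_pandas_ignore_string` comes from modin.tests.pandas.utils and
+# is a filterwarnings() argument that silences Modin's "defaulting to pandas"
+# UserWarning, roughly "ignore:.*defaulting to pandas.*:UserWarning".)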
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + +# Initialize env for storage format detection in @pytest.mark.* +pd.DataFrame() + + +def df_equals_and_sort(df1, df2): + """Sort dataframe's rows and run ``df_equals()`` for them.""" + df1 = df1.sort_values(by=df1.columns.tolist(), ignore_index=True) + df2 = df2.sort_values(by=df2.columns.tolist(), ignore_index=True) + df_equals(df1, df2) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_combine(data, df_mode_pair): + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + data, df_mode=df_mode_pair[1] + ) + modin_df_1.combine( + modin_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 + ) + pandas_df_1.combine( + pandas_df_2 + 1, lambda s1, s2: s1 if s1.count() < s2.count() else s2 + ) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(128, 64)), + ), + ( + np.random.randint(0, 100, size=(128, 64)), + np.random.randint(0, 100, size=(64, 64)), + ), + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(64, 128)), + ), + ( + np.random.randint(0, 100, size=(64, 128)), + np.random.randint(0, 100, size=(64, 64)), + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_join(test_data, test_data2, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[0], + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[1], + ) + + hows = ["inner", "left", "right", "outer"] + ons = ["col33", "col34"] + sorts = [False, True] + assert len(ons) == len(sorts), "the loop below is designed for this condition" + for i in range(len(hows)): + for j in range(len(ons)): + modin_result = modin_df.join( + modin_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + pandas_result = pandas_df.join( + pandas_df2, + how=hows[i], + on=ons[j], + sort=sorts[j], + lsuffix="_caller", + rsuffix="_other", + ) + if sorts[j]: + # sorting in `join` is implemented through range partitioning technique + # therefore the order of the rows after it does not match the pandas, + # so additional sorting is needed in order to get the same result as for pandas + df_equals_and_sort(modin_result, pandas_result) + else: + df_equals(modin_result, pandas_result) + + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], + "col4": [2, 4, 5, 6], + } + + modin_df = pd.DataFrame(frame_data) + pandas_df = pandas.DataFrame(frame_data) + + frame_data2 = {"col5": [0], "col6": [1]} + modin_df2 = pd.DataFrame(frame_data2) + pandas_df2 = pandas.DataFrame(frame_data2) + + join_types = ["left", "right", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join(modin_df2, how=how) + pandas_join = pandas_df.join(pandas_df2, how=how) + df_equals(modin_join, pandas_join) + + frame_data3 = {"col7": 
[1, 2, 3, 5, 6, 7, 8]} + + modin_df3 = pd.DataFrame(frame_data3) + pandas_df3 = pandas.DataFrame(frame_data3) + + join_types = ["left", "outer", "inner"] + for how in join_types: + modin_join = modin_df.join([modin_df2, modin_df3], how=how) + pandas_join = pandas_df.join([pandas_df2, pandas_df3], how=how) + df_equals(modin_join, pandas_join) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_join_cross_6786(df_mode_pair): + data = [[7, 8, 9], [10, 11, 12]] + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + data, columns=["x", "y", "z"], df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + data, columns=["x", "y", "z"], df_mode=df_mode_pair[1] + ) + modin_join = modin_df_1.join( + modin_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" + ) + pandas_join = pandas_df_1.join( + pandas_df_2[["x"]].set_axis(["p", "q"], axis=0), how="cross", lsuffix="p" + ) + df_equals(modin_join, pandas_join) + + +@pytest.mark.parametrize( + "test_data, test_data2", + [ + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(128, 64)), + ), + ( + np.random.randint(0, 100, size=(128, 64)), + np.random.randint(0, 100, size=(64, 64)), + ), + ( + np.random.randint(0, 100, size=(64, 64)), + np.random.randint(0, 100, size=(64, 128)), + ), + ( + np.random.randint(0, 100, size=(64, 128)), + np.random.randint(0, 100, size=(64, 64)), + ), + ], +) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge(test_data, test_data2, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data, + columns=["col{}".format(i) for i in range(test_data.shape[1])], + index=pd.Index([i for i in range(1, test_data.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[0], + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + test_data2, + columns=["col{}".format(i) for i in range(test_data2.shape[1])], + index=pd.Index([i for i in range(1, test_data2.shape[0] + 1)], name="key"), + df_mode=df_mode_pair[1], + ) + hows = ["left", "inner", "right"] + ons = ["col33", ["col33", "col34"]] + sorts = [False, True] + assert len(ons) == len(sorts), "the loop below is designed for this condition" + for i in range(len(hows)): + for j in range(len(ons)): + modin_result = modin_df.merge( + modin_df2, how=hows[i], on=ons[j], sort=sorts[j] + ) + pandas_result = pandas_df.merge( + pandas_df2, how=hows[i], on=ons[j], sort=sorts[j] + ) + # FIXME: https://github.com/modin-project/modin/issues/2246 + df_equals_and_sort(modin_result, pandas_result) + + modin_result = modin_df.merge( + modin_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + pandas_result = pandas_df.merge( + pandas_df2, + how=hows[i], + left_on="key", + right_on="key", + sort=sorts[j], + ) + # FIXME: https://github.com/modin-project/modin/issues/2246 + df_equals_and_sort(modin_result, pandas_result) + + +@pytest.mark.parametrize("how", ["left", "inner", "right"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge_empty( + how, + df_mode_pair, +): + data = np.random.randint(0, 100, size=(64, 64)) + eval_general_interop( + data, + None, + lambda df1, df2: df1.merge(df2.iloc[:0], how=how), + df_mode_pair, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_merge_with_mi_columns(df_mode_pair): + 
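+    # Both frames below carry two-level column labels, so the shared merge
+    # key must be addressed as a tuple, e.g. on=[("col0", "a")].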
modin_df1, pandas_df1 = create_test_df_in_defined_mode( + { + ("col0", "a"): [1, 2, 3, 4], + ("col0", "b"): [2, 3, 4, 5], + ("col1", "a"): [3, 4, 5, 6], + }, + df_mode=df_mode_pair[0], + ) + + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + { + ("col0", "a"): [1, 2, 3, 4], + ("col0", "c"): [2, 3, 4, 5], + ("col1", "a"): [3, 4, 5, 6], + }, + df_mode=df_mode_pair[1], + ) + + eval_general( + (modin_df1, modin_df2), + (pandas_df1, pandas_df2), + lambda dfs: dfs[0].merge(dfs[1], on=[("col0", "a")]), + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_where(df_mode_pair): + columns = list("abcdefghij") + + frame_data = random_state.randn(100, 10) + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, columns=columns, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + frame_data, columns=columns, df_mode=df_mode_pair[1] + ) + pandas_cond_df = pandas_df_2 % 5 < 2 + modin_cond_df = modin_df_2 % 5 < 2 + + pandas_result = pandas_df_1.where(pandas_cond_df, -pandas_df_2) + modin_result = modin_df_1.where(modin_cond_df, -modin_df_2) + assert all((to_pandas(modin_result) == pandas_result).all()) + + # test case when other is Series + other_data = random_state.randn(len(pandas_df_1)) + modin_other, pandas_other = create_test_series_in_defined_mode( + other_data, df_mode=df_mode_pair[0] + ) + pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other, axis=0) + modin_result = modin_df_1.where(modin_cond_df, modin_other, axis=0) + df_equals(modin_result, pandas_result) + + # Test that we choose the right values to replace when `other` == `True` + # everywhere. + other_data = np.full(shape=pandas_df_1.shape, fill_value=True) + modin_other, pandas_other = create_test_df_in_defined_mode( + other_data, columns=columns, df_mode=df_mode_pair[0] + ) + pandas_result = pandas_df_1.where(pandas_cond_df, pandas_other) + modin_result = modin_df_1.where(modin_cond_df, modin_other) + df_equals(modin_result, pandas_result) + + other = pandas_df_1.loc[3] + pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=1) + modin_result = modin_df_1.where(modin_cond_df, other, axis=1) + assert all((to_pandas(modin_result) == pandas_result).all()) + + other = pandas_df_1["e"] + pandas_result = pandas_df_1.where(pandas_cond_df, other, axis=0) + modin_result = modin_df_1.where(modin_cond_df, other, axis=0) + assert all((to_pandas(modin_result) == pandas_result).all()) + + pandas_result = pandas_df_1.where(pandas_df_2 < 2, True) + modin_result = modin_df_1.where(modin_df_2 < 2, True) + assert all((to_pandas(modin_result) == pandas_result).all()) + + +@pytest.mark.parametrize("align_axis", ["index", "columns"]) +@pytest.mark.parametrize("keep_shape", [False, True]) +@pytest.mark.parametrize("keep_equal", [False, True]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_compare(align_axis, keep_shape, keep_equal, df_mode_pair): + kwargs = { + "align_axis": align_axis, + "keep_shape": keep_shape, + "keep_equal": keep_equal, + } + frame_data1 = random_state.randn(100, 10) + frame_data2 = random_state.randn(100, 10) + modin_df, pandas_df = create_test_df_in_defined_mode( + frame_data1, columns=list("abcdefghij"), df_mode=df_mode_pair[0] + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + frame_data2, columns=list("abcdefghij"), df_mode=df_mode_pair[0] + ) + modin_result = modin_df.compare(modin_df2, **kwargs) + pandas_result = 
pandas_df.compare(pandas_df2, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + modin_result = modin_df2.compare(modin_df, **kwargs) + pandas_result = pandas_df2.compare(pandas_df, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + series_data1 = ["a", "b", "c", "d", "e"] + series_data2 = ["a", "a", "c", "b", "e"] + modin_series1, pandas_series1 = create_test_series_in_defined_mode( + series_data1, df_mode=df_mode_pair[0] + ) + modin_series2, pandas_series2 = create_test_series_in_defined_mode( + series_data2, df_mode=df_mode_pair[1] + ) + + modin_result = modin_series1.compare(modin_series2, **kwargs) + pandas_result = pandas_series1.compare(pandas_series2, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) + + modin_result = modin_series2.compare(modin_series1, **kwargs) + pandas_result = pandas_series2.compare(pandas_series1, **kwargs) + assert to_pandas(modin_result).equals(pandas_result) diff --git a/modin/tests/pandas/native_df_mode/test_map_metadata.py b/modin/tests/pandas/native_df_mode/test_map_metadata.py new file mode 100644 index 00000000000..e9e460ffbc8 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_map_metadata.py @@ -0,0 +1,258 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + + +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions, StorageFormat +from modin.tests.pandas.native_df_mode.utils import ( + create_test_df_in_defined_mode, + create_test_series_in_defined_mode, +) +from modin.tests.pandas.utils import ( + RAND_HIGH, + RAND_LOW, + axis_keys, + axis_values, + default_to_pandas_ignore_string, + df_equals, + eval_general, + name_contains, + numeric_dfs, + random_state, + test_data, + test_data_keys, + test_data_values, +) + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +# Our configuration in pytest.ini requires that we explicitly catch all +# instances of defaulting to pandas, but some test modules, like this one, +# have too many such instances. 
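+# (A module-level `pytestmark` applies the filter to every test in this file.
+# The create_test_*_in_defined_mode helpers used throughout return a
+# (modin, pandas) pair built from the same data, with the Modin object
+# constructed under the requested NativeDataframeMode, so each test can run
+# one operation on both objects and compare the results via df_equals.)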
+pytestmark = pytest.mark.filterwarnings(default_to_pandas_ignore_string) + + +def eval_insert(modin_df, pandas_df, **kwargs): + if "col" in kwargs and "column" not in kwargs: + kwargs["column"] = kwargs.pop("col") + _kwargs = {"loc": 0, "column": "New column"} + _kwargs.update(kwargs) + + eval_general( + modin_df, + pandas_df, + operation=lambda df, **kwargs: df.insert(**kwargs), + __inplace__=True, + **_kwargs, + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_empty_df(df_mode_pair): + modin_df, pd_df = create_test_df_in_defined_mode(None, df_mode=df_mode_pair[0]) + md_series, pd_series = create_test_series_in_defined_mode( + [1, 2, 3, 4, 5], df_mode=df_mode_pair[1] + ) + modin_df["a"] = md_series + pd_df["a"] = pd_series + df_equals(modin_df, pd_df) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_astype(df_mode_pair): + td = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]] + modin_df, pandas_df = create_test_df_in_defined_mode( + td.values, + index=td.index, + columns=td.columns, + df_mode=df_mode_pair[0], + ) + + def astype_func(df): + md_ser, pd_ser = create_test_series_in_defined_mode( + [str, str], index=["col1", "col1"], df_mode=df_mode_pair[1] + ) + if isinstance(df, pd.DataFrame): + return df.astype(md_ser) + else: + return df.astype(pd_ser) + + # The dtypes series must have a unique index. + eval_general( + modin_df, + pandas_df, + astype_func, + expected_exception=ValueError( + "cannot reindex on an axis with duplicate labels" + ), + ) + + +########################################################################### + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_convert_dtypes_5653(df_mode_pair): + modin_part1, _ = create_test_df_in_defined_mode( + {"col1": ["a", "b", "c", "d"]}, df_mode=df_mode_pair[0] + ) + modin_part2, _ = create_test_df_in_defined_mode( + {"col1": [None, None, None, None]}, df_mode=df_mode_pair[1] + ) + modin_df = pd.concat([modin_part1, modin_part2]) + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": + assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) + modin_df = modin_df.convert_dtypes() + assert len(modin_df.dtypes) == 1 + assert modin_df.dtypes.iloc[0] == "string" + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) +@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) +@pytest.mark.exclude_in_sanity +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_clip(request, data, axis, bound_type, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + + if name_contains(request.node.name, numeric_dfs): + ind_len = ( + len(modin_df.index) + if not pandas.DataFrame()._get_axis_number(axis) + else len(modin_df.columns) + ) + + lower = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) + upper = random_state.randint(RAND_LOW, RAND_HIGH, ind_len) + + if bound_type == "series": + modin_lower, pandas_lower = create_test_series_in_defined_mode( + lower, df_mode=df_mode_pair[1] + ) + modin_upper, pandas_upper = create_test_series_in_defined_mode( + upper, df_mode=df_mode_pair[0] + ) + else: + modin_lower = pandas_lower = lower + modin_upper = pandas_upper = upper + + # test 
lower and upper list bound on each column + modin_result = modin_df.clip(modin_lower, modin_upper, axis=axis) + pandas_result = pandas_df.clip(pandas_lower, pandas_upper, axis=axis) + df_equals(modin_result, pandas_result) + + # test only upper list bound on each column + modin_result = modin_df.clip(np.nan, modin_upper, axis=axis) + pandas_result = pandas_df.clip(np.nan, pandas_upper, axis=axis) + df_equals(modin_result, pandas_result) + + with pytest.raises(ValueError): + modin_df.clip(lower=[1, 2, 3], axis=None) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ( + {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, + {"B": ["d", "e", "f", "g", "h", "i"]}, + ), + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, np.nan, 6]}), + ], +) +@pytest.mark.parametrize("errors", ["raise", "ignore"]) +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_update(data, other_data, errors, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode(data, df_mode=df_mode_pair[0]) + other_modin_df, other_pandas_df = create_test_df_in_defined_mode( + other_data, df_mode=df_mode_pair[1] + ) + expected_exception = None + if errors == "raise": + expected_exception = ValueError("Data overlaps.") + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.update(other_modin_df, errors=errors) + if isinstance(df, pd.DataFrame) + else df.update(other_pandas_df, errors=errors) + ), + __inplace__=True, + expected_exception=expected_exception, + ) + + +@pytest.mark.parametrize( + "get_index", + [ + pytest.param(lambda idx: None, id="None_idx"), + pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), + pytest.param(lambda idx: idx, id="Equal_idx"), + pytest.param(lambda idx: idx[::-1], id="Reversed_idx"), + ], +) +@pytest.mark.parametrize( + "get_columns", + [ + pytest.param(lambda idx: None, id="None_idx"), + pytest.param(lambda idx: ["a", "b", "c"], id="No_intersection_idx"), + pytest.param(lambda idx: idx, id="Equal_idx"), + pytest.param(lambda idx: idx[::-1], id="Reversed_idx"), + ], +) +@pytest.mark.parametrize("dtype", [None, "str"]) +@pytest.mark.exclude_in_sanity +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_constructor_from_modin_series(get_index, get_columns, dtype, df_mode_pair): + modin_df, pandas_df = create_test_df_in_defined_mode( + test_data_values[0], df_mode=df_mode_pair[0] + ) + + modin_data = {f"new_col{i}": modin_df.iloc[:, i] for i in range(modin_df.shape[1])} + pandas_data = { + f"new_col{i}": pandas_df.iloc[:, i] for i in range(pandas_df.shape[1]) + } + + index = get_index(modin_df.index) + columns = get_columns(list(modin_data.keys())) + + new_modin = pd.DataFrame(modin_data, index=index, columns=columns, dtype=dtype) + new_pandas = pandas.DataFrame( + pandas_data, index=index, columns=columns, dtype=dtype + ) + df_equals(new_modin, new_pandas) diff --git a/modin/tests/pandas/native_df_mode/test_pickle.py b/modin/tests/pandas/native_df_mode/test_pickle.py new file mode 100644 index 00000000000..cf9b4dfcb9c --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_pickle.py @@ -0,0 +1,73 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from itertools import product + +import numpy as np +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, PersistentPickle +from modin.tests.pandas.native_df_mode.utils import create_test_df_in_defined_mode +from modin.tests.pandas.utils import df_equals + + +@pytest.fixture +def modin_df(): + return pd.DataFrame({"col1": np.arange(1000), "col2": np.arange(2000, 3000)}) + + +@pytest.fixture +def modin_column(modin_df): + return modin_df["col1"] + + +@pytest.fixture(params=[True, False]) +def persistent(request): + old = PersistentPickle.get() + PersistentPickle.put(request.param) + yield request.param + PersistentPickle.put(old) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test__reduce__(df_mode_pair): + # `DataFrame.__reduce__` will be called implicitly when lambda expressions are + # pre-processed for the distributed engine. + dataframe_data = ["Major League Baseball", "National Basketball Association"] + abbr_md, abbr_pd = create_test_df_in_defined_mode( + dataframe_data, index=["MLB", "NBA"], df_mode=df_mode_pair[0] + ) + + dataframe_data = { + "name": ["Mariners", "Lakers"] * 500, + "league_abbreviation": ["MLB", "NBA"] * 500, + } + teams_md, teams_pd = create_test_df_in_defined_mode( + dataframe_data, df_mode=df_mode_pair[1] + ) + + result_md = ( + teams_md.set_index("name") + .league_abbreviation.apply(lambda abbr: abbr_md[0].loc[abbr]) + .rename("league") + ) + + result_pd = ( + teams_pd.set_index("name") + .league_abbreviation.apply(lambda abbr: abbr_pd[0].loc[abbr]) + .rename("league") + ) + df_equals(result_md, result_pd) diff --git a/modin/tests/pandas/native_df_mode/test_window.py b/modin/tests/pandas/native_df_mode/test_window.py new file mode 100644 index 00000000000..7e8e5da9342 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/test_window.py @@ -0,0 +1,101 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
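+
+# These tests create each operand under its own `NativeDataframeMode` (via the
+# `df_mode_pair` parameter) to check that native and distributed DataFrames
+# interoperate in `fillna`-style operations.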
+ +from itertools import product + +import matplotlib +import numpy as np +import pandas +import pytest + +import modin.pandas as pd +from modin.config import NativeDataframeMode, NPartitions +from modin.tests.pandas.native_df_mode.utils import create_test_df_in_defined_mode +from modin.tests.pandas.utils import df_equals + +NPartitions.put(4) + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_fillna_4660(df_mode_pair): + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + {"a": ["a"], "b": ["b"], "c": [pd.NA]}, + index=["row1"], + df_mode=df_mode_pair[0], + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + {"a": ["a"], "b": ["b"], "c": [pd.NA]}, + index=["row1"], + df_mode=df_mode_pair[1], + ) + modin_result = modin_df_1["c"].fillna(modin_df_2["b"]) + pandas_result = pandas_df_1["c"].fillna(pandas_df_2["b"]) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_fillna_dict_series(df_mode_pair): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + frame_data, df_mode=df_mode_pair[1] + ) + + df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})) + + df_equals( + modin_df.fillna({"a": 0, "b": 5, "d": 7}), + df.fillna({"a": 0, "b": 5, "d": 7}), + ) + + # Series treated same as dict + df_equals( + modin_df_1.fillna(modin_df_2.max()), pandas_df_1.fillna(pandas_df_2.max()) + ) + + +@pytest.mark.parametrize( + "df_mode_pair", list(product(NativeDataframeMode.choices, repeat=2)) +) +def test_fillna_dataframe(df_mode_pair): + frame_data = { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + modin_df_1, pandas_df_1 = create_test_df_in_defined_mode( + frame_data, index=list("VWXYZ"), df_mode=df_mode_pair[0] + ) + modin_df_2, pandas_df_2 = create_test_df_in_defined_mode( + {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5}, + index=list("VWXuZ"), + df_mode=df_mode_pair[1], + ) + + # only those columns and indices which are shared get filled + df_equals(modin_df_1.fillna(modin_df_2), pandas_df_1.fillna(pandas_df_2)) diff --git a/modin/tests/pandas/native_df_mode/utils.py b/modin/tests/pandas/native_df_mode/utils.py new file mode 100644 index 00000000000..9e9d77ac1f7 --- /dev/null +++ b/modin/tests/pandas/native_df_mode/utils.py @@ -0,0 +1,133 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific language +# governing permissions and limitations under the License. + +from modin.config import Engine +from modin.config.pubsub import context +from modin.tests.pandas.utils import ( + NoModinException, + create_test_dfs, + create_test_series, + df_equals, +) +from modin.utils import try_cast_to_pandas + + +def create_test_df_in_defined_mode( + *args, post_fn=None, backend=None, df_mode=None, **kwargs +): + with context(NativeDataframeMode=df_mode): + return create_test_dfs(*args, post_fn=post_fn, backend=backend, **kwargs) + + +def create_test_series_in_defined_mode( + vals, sort=False, backend=None, df_mode=None, **kwargs +): + with context(NativeDataframeMode=df_mode): + return create_test_series(vals, sort=sort, backend=backend, **kwargs) + + +def eval_general_interop( + data, + backend, + operation, + df_mode_pair, + comparator=df_equals, + __inplace__=False, + expected_exception=None, + check_kwargs_callable=True, + md_extra_kwargs=None, + comparator_kwargs=None, + **kwargs, +): + df_mode1, df_mode2 = df_mode_pair + modin_df1, pandas_df1 = create_test_df_in_defined_mode( + data, backend=backend, df_mode=df_mode1 + ) + modin_df2, pandas_df2 = create_test_df_in_defined_mode( + data, backend=backend, df_mode=df_mode2 + ) + md_kwargs, pd_kwargs = {}, {} + + def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}): + try: + pd_result = fn(pandas_df1, pandas_df2, **pd_kwargs) + except Exception as pd_e: + try: + if inplace: + _ = fn(modin_df1, modin_df2, **md_kwargs) + try_cast_to_pandas(modin_df1) # force materialization + else: + try_cast_to_pandas( + fn(modin_df1, modin_df2, **md_kwargs) + ) # force materialization + except Exception as md_e: + assert isinstance( + md_e, type(pd_e) + ), "Got Modin Exception type {}, but pandas Exception type {} was expected".format( + type(md_e), type(pd_e) + ) + if expected_exception: + if Engine.get() == "Ray": + from ray.exceptions import RayTaskError + + # unwrap ray exceptions from remote worker + if isinstance(md_e, RayTaskError): + md_e = md_e.args[0] + assert ( + type(md_e) is type(expected_exception) + and md_e.args == expected_exception.args + ), f"not acceptable Modin's exception: [{repr(md_e)}]" + assert ( + pd_e.args == expected_exception.args + ), f"not acceptable Pandas' exception: [{repr(pd_e)}]" + elif expected_exception is False: + # The only way to disable exception message checking. + pass + else: + # It’s not enough that Modin and pandas have the same types of exceptions; + # we need to explicitly specify the instance of an exception + # (using `expected_exception`) in tests so that we can check exception messages. + # This allows us to eliminate situations where exceptions are thrown + # that we don't expect, which could hide different bugs. 
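+                    # Re-raise the original pandas exception so an unexpected
+                    # mismatch fails loudly instead of passing unnoticed.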
+                    raise pd_e
+            else:
+                raise NoModinException(
+                    f"Modin doesn't throw an exception, while pandas does: [{repr(pd_e)}]"
+                )
+        else:
+            md_result = fn(modin_df1, modin_df2, **md_kwargs)
+            return (md_result, pd_result) if not inplace else (modin_df1, pandas_df1)
+
+    for key, value in kwargs.items():
+        if check_kwargs_callable and callable(value):
+            values = execute_callable(value)
+            # `None` means the callable raised an exception that was already
+            # validated inside `execute_callable`
+            if values is None:
+                return
+            else:
+                md_value, pd_value = values
+        else:
+            md_value, pd_value = value, value
+
+        md_kwargs[key] = md_value
+        pd_kwargs[key] = pd_value
+
+    if md_extra_kwargs:
+        assert isinstance(md_extra_kwargs, dict)
+        md_kwargs.update(md_extra_kwargs)
+
+    values = execute_callable(
+        operation, md_kwargs=md_kwargs, pd_kwargs=pd_kwargs, inplace=__inplace__
+    )
+    if values is not None:
+        comparator(*values, **(comparator_kwargs or {}))

From 156cd51fd779fbff5a9e5da928c5b3624114b185 Mon Sep 17 00:00:00 2001
From: Arun Jose <40291569+arunjose696@users.noreply.github.com>
Date: Fri, 6 Sep 2024 15:33:08 +0200
Subject: [PATCH 16/20] DOCS-#7382: Add documentation on how to use Modin
 Native query compiler (#7386)

Co-authored-by: Iaroslav Igoshev
Signed-off-by: arunjose696
---
 docs/usage_guide/optimization_notes/index.rst | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/docs/usage_guide/optimization_notes/index.rst b/docs/usage_guide/optimization_notes/index.rst
index 0dcbe5a25d7..6e9d1ca7d63 100644
--- a/docs/usage_guide/optimization_notes/index.rst
+++ b/docs/usage_guide/optimization_notes/index.rst
@@ -314,6 +314,37 @@ Copy-pastable example, showing how mixing pandas and Modin DataFrames in a singl
     # Possible output: TypeError
 
+
+Execute DataFrame operations using NativeQueryCompiler
+""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+By default, Modin distributes data across partitions and performs operations
+using the ``PandasQueryCompiler``. However, for certain scenarios such as handling
+small or empty DataFrames, distributing them may introduce unnecessary overhead.
+In such cases, it's more efficient to default to pandas at the query compiler layer.
+This can be achieved by setting the ``cfg.NativeDataframeMode``
+:doc:`configuration variable: ` to ``Pandas``. When set to ``Pandas``, all operations
+in Modin default to pandas, and the DataFrames are not distributed, avoiding
+additional overhead. This configuration can be toggled on or off depending on
+whether DataFrame distribution is required.
+
+DataFrames created while ``NativeDataframeMode`` is set to ``Pandas`` keep using the
+``NativeQueryCompiler`` even after the config is switched back to ``Default``. Modin
+supports interoperability between distributed Modin DataFrames and those using the
+``NativeQueryCompiler``.
+
+.. code-block:: python
+
+    import modin.pandas as pd
+    import modin.config as cfg
+
+    # This dataframe will be distributed and use `PandasQueryCompiler` by default
+    df_distributed = pd.DataFrame(...)
+
+    # Set mode to "Pandas" to avoid distribution and use `NativeQueryCompiler`
+    cfg.NativeDataframeMode.put("Pandas")
+    df_native_qc = pd.DataFrame(...)
+
+    # Revert to default settings for distributed dataframes
+    cfg.NativeDataframeMode.put("Default")
+    df_distributed = pd.DataFrame(...)
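+
+Because the two kinds of DataFrames interoperate, they can be combined in a
+single operation. A minimal sketch (the column values are only illustrative):
+
+.. code-block:: python
+
+    import modin.pandas as pd
+    import modin.config as cfg
+
+    cfg.NativeDataframeMode.put("Pandas")
+    df_native_qc = pd.DataFrame({"a": [1, 2, 3]})  # uses `NativeQueryCompiler`
+
+    cfg.NativeDataframeMode.put("Default")
+    df_distributed = pd.DataFrame({"a": [4, 5, 6]})  # distributed as usual
+
+    # Both DataFrames can take part in the same operation.
+    result = pd.concat([df_native_qc, df_distributed])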
+
 Operation-specific optimizations
 """"""""""""""""""""""""""""""""

From f3c0a63579bb6cee861ea04344ddedd72221634e Mon Sep 17 00:00:00 2001
From: Iaroslav Igoshev
Date: Fri, 6 Sep 2024 18:34:43 +0200
Subject: [PATCH 17/20] FIX-#7387: Limit the number of pytest workers for
 tests with Ray engine on Windows (#7388)

Signed-off-by: Igoshev, Iaroslav
---
 .github/workflows/ci.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8fb26225613..9b0d5b49783 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -443,12 +443,28 @@ jobs:
       - run: python -m pytest -n 2 modin/tests/experimental/test_pipeline.py
         if: matrix.engine == 'python' || matrix.test_task == 'group_1'
       - uses: ./.github/actions/run-core-tests/group_1
+        with:
+          # When running with the Ray engine on Windows, tests fail in CI when using 2 pytest workers.
+          # See https://github.com/modin-project/modin/issues/7387.
+          parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
         if: matrix.engine == 'python' || matrix.test_task == 'group_1'
       - uses: ./.github/actions/run-core-tests/group_2
+        with:
+          # When running with the Ray engine on Windows, tests fail in CI when using 2 pytest workers.
+          # See https://github.com/modin-project/modin/issues/7387.
+          parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
         if: matrix.engine == 'python' || matrix.test_task == 'group_2'
       - uses: ./.github/actions/run-core-tests/group_3
+        with:
+          # When running with the Ray engine on Windows, tests fail in CI when using 2 pytest workers.
+          # See https://github.com/modin-project/modin/issues/7387.
+          parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }}
         if: matrix.engine == 'python' || matrix.test_task == 'group_3'
       - uses: ./.github/actions/run-core-tests/group_4
+        with:
+          # When running with the Ray engine on Windows, tests fail in CI when using 2 pytest workers.
+          # See https://github.com/modin-project/modin/issues/7387.
+ parallel: ${{ matrix.engine == 'ray' && matrix.os == 'windows' && '-n 1' || '-n 2' }} if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest -n 2 modin/tests/numpy if: matrix.engine == 'python' || matrix.test_task == 'group_4' From 33577098afa51d6c96ac154af88f9680fc4abf8b Mon Sep 17 00:00:00 2001 From: Iaroslav Igoshev Date: Mon, 9 Sep 2024 16:32:37 +0200 Subject: [PATCH 18/20] FIX-#7389: Fix uploading artifacts (#7390) Signed-off-by: Igoshev, Iaroslav --- .github/actions/upload-coverage/action.yml | 1 + .github/workflows/ci.yml | 2 ++ .github/workflows/fuzzydata-test.yml | 1 + .github/workflows/publish-to-pypi.yml | 1 + 4 files changed, 5 insertions(+) diff --git a/.github/actions/upload-coverage/action.yml b/.github/actions/upload-coverage/action.yml index 07c08984111..3d918f12c1c 100644 --- a/.github/actions/upload-coverage/action.yml +++ b/.github/actions/upload-coverage/action.yml @@ -15,3 +15,4 @@ runs: with: name: coverage-data-${{ env.COVERAGE_UUID }} path: .coverage* + include-hidden-files: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b0d5b49783..450d1d01a5d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -235,6 +235,7 @@ jobs: with: name: Benchmarks log path: asv_bench/benchmarks.log + include-hidden-files: true if: failure() execution-filter: @@ -737,6 +738,7 @@ jobs: with: name: coverage-data pattern: coverage-data-* + include-hidden-files: true delete-merged: true upload-coverage: diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml index c9b2b2a4a29..2dd86ad9dd6 100644 --- a/.github/workflows/fuzzydata-test.yml +++ b/.github/workflows/fuzzydata-test.yml @@ -48,3 +48,4 @@ jobs: name: fuzzydata-test-workflow-${{matrix.engine}} path: /tmp/fuzzydata-test-wf-${{matrix.engine}}/* # Must match output dir in test_fuzzydata.py if-no-files-found: error + include-hidden-files: true diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 514a72481cb..e2beac1eac9 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -38,6 +38,7 @@ jobs: with: name: modin-wheel-and-source-tarball path: ./dist/ + include-hidden-files: true - name: Publish Modin wheel to PyPI if: github.event_name == 'push' From 3e951a63084a9cbfd5e73f6f36653ee12d2a2bfa Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 11 Sep 2024 14:51:21 +0200 Subject: [PATCH 19/20] Release version 0.32.0 (#7393) Signed-off-by: Anatoly Myachev From 05f5e7d2f9aedcecc3f26e42be76d94ec5faf713 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Sat, 14 Sep 2024 06:43:16 -0700 Subject: [PATCH 20/20] FIX-#7375: Fix Series.duplicated dropping name (#7395) * FIX-#7375: Fix Series.duplicated dropping name Signed-off-by: Jonathan Shi * Update modin/pandas/series.py Co-authored-by: Anatoly Myachev --------- Signed-off-by: Jonathan Shi Co-authored-by: Anatoly Myachev --- modin/pandas/series.py | 7 ++++++- modin/tests/pandas/test_series.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d18a0bec778..00083200762 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1022,7 +1022,12 @@ def duplicated(self, keep="first") -> Series: # noqa: PR01, RT01, D200 """ Indicate duplicate Series values. 
""" - return self.to_frame().duplicated(keep=keep) + name = self.name + result = self.to_frame().duplicated(keep=keep) + # DataFrame.duplicated drops the name, so we need to manually restore it + if name is not None: + result.name = name + return result def eq( self, other, level=None, fill_value=None, axis=0 diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 9dd8b98aac3..b283a7a1ede 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1942,6 +1942,12 @@ def test_duplicated(data, keep): df_equals(modin_result, pandas_series.duplicated(keep=keep)) +def test_duplicated_keeps_name_issue_7375(): + # Ensure that the name property of a series is preserved across duplicated + modin_series, pandas_series = create_test_series([1, 2, 3, 1], name="a") + df_equals(modin_series.duplicated(), pandas_series.duplicated()) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_empty(data): modin_series, pandas_series = create_test_series(data)