Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make cudf.pandas proxy array picklable #17929

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/cudf/cudf/pandas/_wrappers/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,23 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
return result


def ndarray__reduce__(self):
    """Return the ``__reduce__`` tuple used to pickle a proxy ndarray.

    The result is a ``(callable, args)`` pair: ``ndarray.__new__`` together
    with the proxy ``ndarray`` type and the object wrapped by this proxy.
    """
    # Proxy arrays need their own pickling path. The pickle constructor
    # used to deserialize every other proxy type rebuilds instances via
    # object.__new__(type), and that call cannot be made on subclasses of
    # numpy arrays because the new array would not be created with
    # numpy's specific memory management logic. Serialization is
    # therefore handled separately for proxy arrays.
    reconstructor = ndarray.__new__
    reconstructor_args = (ndarray, self._fsproxy_wrapped)
    return (reconstructor, reconstructor_args)


ndarray = make_final_proxy_type(
"ndarray",
cupy.ndarray,
Expand All @@ -140,6 +157,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
"__cuda_array_interface__": cuda_array_interface,
"__array_interface__": array_interface,
"__array_ufunc__": ndarray__array_ufunc__,
"__reduce__": ndarray__reduce__,
# ndarrays are unhashable
"__hash__": None,
# iter(cupy-array) produces an iterable of zero-dim device
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1979,3 +1979,18 @@ def test_numpy_data_access():
actual = xs.values.data

assert type(expected) is type(actual)


def test_pickle_round_trip_proxy_numpy_array(array):
    """Proxy and plain arrays must compare equal after a pickle round trip."""
    arr, proxy_arr = array

    # A BytesIO built from bytes starts at position 0, so it can be read
    # back directly without an explicit seek.
    arr_stream = BytesIO(pickle.dumps(arr))
    proxy_stream = BytesIO(pickle.dumps(proxy_arr))

    restored_proxy = pickle.load(proxy_stream)
    restored_arr = pickle.load(arr_stream)
    np.testing.assert_equal(restored_proxy, restored_arr)
Comment on lines +1984 to +1996
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: This can be simplified a bit, I think. I'll tack it on to another PR.

def test_pickle_round_trip_proxy_numpy_array(array):
    """Proxy and plain arrays must be equal after a pickle round trip."""
    # NOTE(review): ``array`` is presumably a fixture yielding a
    # (plain array, proxy array) pair -- confirm against its definition.
    arr, proxy_arr = array
    np.testing.assert_equal(
        pickle.loads(pickle.dumps(proxy_arr)), 
        pickle.loads(pickle.dumps(arr))
    )

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import holoviews as hv
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -71,9 +71,6 @@ def test_holoviews_heatmap(df):
)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_holoviews_histogram(df):
    """Return ``get_plot_info`` for an ``hv.Histogram`` of ``df.values``."""
    histogram = hv.Histogram(df.values)
    return get_plot_info(histogram)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -33,19 +33,13 @@ def assert_plots_equal(expect, got):
pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_line():
    """Plot ``y`` against ``x`` as a marked line and return the current axes."""
    frame = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
    x_values = frame["x"]
    y_values = frame["y"]
    # Single-element unpacking: raises unless plt.plot returns exactly
    # one item.
    (_line,) = plt.plot(x_values, y_values, marker="o", linestyle="-")
    return plt.gca()


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_bar():
data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
ax = data.plot(kind="bar")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -37,9 +37,6 @@ def test_numpy_dot(df):
return np.dot(df, df.T)


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_numpy_fft(sr):
    """Return the result of ``np.fft.fft`` applied to ``sr``."""
    return np.fft.fft(sr)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import pandas as pd
import pytest
import seaborn as sns
Expand Down Expand Up @@ -54,9 +54,6 @@ def test_scatter(df):
return ax


@pytest.mark.skip(
reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
)
def test_lineplot_with_sns_data():
df = sns.load_dataset("flights")
ax = sns.lineplot(data=df, x="month", y="passengers")
Expand Down
Loading