From cf0d0eaf994021b5405fb54027641c35efe86e24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Mon, 24 Feb 2025 08:27:39 +0100
Subject: [PATCH 1/3] Default to phony_dims="access" in h5netcdf-backend
 (#10058)

* Default to phony_dims="access" in open_datatree for h5netcdf-backend.
  Warn user about behaviour change.

* relocate

* duplicate as needed in both places

* ignore warning

* conditionally warn users if phony_dims are found

* add test for warning

* add whats-new.rst entry

* remove unneeded assignment to fix typing

* Update doc/whats-new.rst

* use phony_dims="access" by default also in open_dataset for h5netcdf backend

* fix test

* fix whats-new.rst
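
As an illustrative sketch of the behaviour change (the file name is
hypothetical, not part of this change):

    import xarray as xr

    # HDF5 file whose datasets carry no dimension scales:
    # previously this raised an error unless phony_dims was passed explicitly,
    # now it emits a UserWarning and falls back to phony_dims="access"
    ds = xr.open_dataset("no_dimscales.h5", engine="h5netcdf")

    # passing phony_dims explicitly silences the warning; "sort" gives
    # full netCDF equivalence, as suggested by the warning message
    ds = xr.open_dataset("no_dimscales.h5", engine="h5netcdf", phony_dims="sort")
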
---
 doc/whats-new.rst                      |  6 ++--
 xarray/backends/h5netcdf_.py           | 39 ++++++++++++++++++++++++++
 xarray/tests/test_backends.py          | 22 +++++++++++++++
 xarray/tests/test_backends_datatree.py | 23 +++++++++++++++
 4 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index d9d4998d983..b98bded5d32 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -33,11 +33,13 @@ New Features
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
-
+- Warn instead of raise if phony_dims are detected when using h5netcdf-backend and ``phony_dims=None`` (:issue:`10049`, :pull:`10058`).
+  By `Kai Mühlbauer `_.
 
 Deprecations
 ~~~~~~~~~~~~
-
+- Move from ``phony_dims=None`` to ``phony_dims="access"`` for h5netcdf-backend (:issue:`10049`, :pull:`10058`).
+  By `Kai Mühlbauer `_.
 
 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 1474ca0b650..6e0e5a2cf3f 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -372,6 +372,23 @@ def close(self, **kwargs):
         self._manager.close(**kwargs)
 
 
+def _check_phony_dims(phony_dims):
+    emit_phony_dims_warning = False
+    if phony_dims is None:
+        emit_phony_dims_warning = True
+        phony_dims = "access"
+    return emit_phony_dims_warning, phony_dims
+
+
+def _emit_phony_dims_warning():
+    emit_user_level_warning(
+        "The 'phony_dims' kwarg now defaults to 'access'. "
+        "Previously 'phony_dims=None' would raise an error. "
+        "For full netcdf equivalence please use phony_dims='sort'.",
+        UserWarning,
+    )
+
+
 class H5netcdfBackendEntrypoint(BackendEntrypoint):
     """
     Backend for netCDF files based on the h5netcdf package.
@@ -434,6 +451,10 @@ def open_dataset(
         driver_kwds=None,
         storage_options: dict[str, Any] | None = None,
     ) -> Dataset:
+        # Keep this message for some versions
+        # remove and set phony_dims="access" above
+        emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)
+
         filename_or_obj = _normalize_path(filename_or_obj)
         store = H5NetCDFStore.open(
             filename_or_obj,
@@ -460,6 +481,13 @@ def open_dataset(
             use_cftime=use_cftime,
             decode_timedelta=decode_timedelta,
         )
+
+        # only warn if phony_dims exist in file
+        # remove together with the above check
+        # after some versions
+        if store.ds._root._phony_dim_count > 0 and emit_phony_dims_warning:
+            _emit_phony_dims_warning()
+
         return ds
 
     def open_datatree(
@@ -530,6 +558,10 @@ def open_groups_as_dict(
         from xarray.core.treenode import NodePath
         from xarray.core.utils import close_on_error
 
+        # Keep this message for some versions
+        # remove and set phony_dims="access" above
+        emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)
+
         filename_or_obj = _normalize_path(filename_or_obj)
         store = H5NetCDFStore.open(
             filename_or_obj,
@@ -542,6 +574,7 @@ def open_groups_as_dict(
             driver=driver,
             driver_kwds=driver_kwds,
         )
+
         # Check for a group and make it a parent if it exists
         if group:
             parent = NodePath("/") / NodePath(group)
@@ -571,6 +604,12 @@ def open_groups_as_dict(
             group_name = str(NodePath(path_group))
             groups_dict[group_name] = group_ds
 
+        # only warn if phony_dims exist in file
+        # remove together with the above check
+        # after some versions
+        if store.ds._phony_dim_count > 0 and emit_phony_dims_warning:
+            _emit_phony_dims_warning()
+
         return groups_dict
 
 
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 0237252f975..83d5afa6a09 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -4170,6 +4170,28 @@ def test_roundtrip_complex(self):
         with self.roundtrip(expected) as actual:
             assert_equal(expected, actual)
 
+    def test_phony_dims_warning(self) -> None:
+        import h5py
+
+        foo_data = np.arange(125).reshape(5, 5, 5)
+        bar_data = np.arange(625).reshape(25, 5, 5)
+        var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
+        with create_tmp_file() as tmp_file:
+            with h5py.File(tmp_file, "w") as f:
+                grps = ["bar", "baz"]
+                for grp in grps:
+                    fx = f.create_group(grp)
+                    for k, v in var.items():
+                        fx.create_dataset(k, data=v)
+            with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
+                with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
+                    assert ds.dims == {
+                        "phony_dim_0": 5,
+                        "phony_dim_1": 5,
+                        "phony_dim_2": 5,
+                        "phony_dim_3": 25,
+                    }
+
 
 @requires_h5netcdf
 @requires_netCDF4
diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 91edba07862..8819735e26a 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -372,6 +372,29 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
 
 class TestH5NetCDFDatatreeIO(DatatreeIOBase):
     engine: T_DataTreeNetcdfEngine | None = "h5netcdf"
 
+    def test_phony_dims_warning(self, tmpdir) -> None:
+        filepath = tmpdir + "/phony_dims.nc"
+        import h5py
+
+        foo_data = np.arange(125).reshape(5, 5, 5)
+        bar_data = np.arange(625).reshape(25, 5, 5)
+        var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
+        with h5py.File(filepath, "w") as f:
+            grps = ["bar", "baz"]
+            for grp in grps:
+                fx = f.create_group(grp)
+                for k, v in var.items():
+                    fx.create_dataset(k, data=v)
+
+        with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
+            with open_datatree(filepath, engine=self.engine) as tree:
+                assert tree.bar.dims == {
+                    "phony_dim_0": 5,
+                    "phony_dim_1": 5,
+                    "phony_dim_2": 5,
+                    "phony_dim_3": 25,
+                }
+
     @pytest.mark.skipif(
         have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"

From df33a9ae112b093d1995389bf75ddd1cd6ce21f1 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 24 Feb 2025 08:57:13 +0100
Subject: [PATCH 2/3] More permissive Index typing (#10067)

* Index.isel: more permissive return type

This allows an index to return a new index of another type, e.g., a
1-dimensional CoordinateTransformIndex to return a PandasIndex when a new
transform cannot be computed (selection at arbitrary locations).

* Index.equals: more permissive `other` type.

Xarray alignment logic is such that Xarray indexes are always compared with
other indexes of the same type. However, this is not necessarily the case
for "meta" indexes (i.e., indexes encapsulating one or more index objects
that may have another type) that dispatch `equals` to their wrapped
indexes.
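
A minimal sketch of such a "meta" index (class and attribute names are
illustrative only, not part of this change):

    import xarray as xr

    class MetaIndex(xr.Index):
        """Wraps another index and dispatches comparisons to it."""

        def __init__(self, wrapped: xr.Index):
            self.wrapped = wrapped

        def equals(self, other: xr.Index) -> bool:
            # `other` may be a MetaIndex or an index of the wrapped type,
            # which is why the signature accepts Index rather than Self
            if isinstance(other, MetaIndex):
                other = other.wrapped
            if type(other) is not type(self.wrapped):
                return False
            return self.wrapped.equals(other)
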
---
 xarray/core/indexes.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index 43e231e84d4..22febf0b707 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -209,7 +209,7 @@ def to_pandas_index(self) -> pd.Index:
 
     def isel(
         self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
-    ) -> Self | None:
+    ) -> Index | None:
         """Maybe returns a new index from the current index itself indexed by
         positional indexers.
 
@@ -304,7 +304,7 @@ def reindex_like(self, other: Self) -> dict[Hashable, Any]:
         """
         raise NotImplementedError(f"{self!r} doesn't support re-indexing labels")
 
-    def equals(self, other: Self) -> bool:
+    def equals(self, other: Index) -> bool:
         """Compare this index with another index of the same type.
 
         Implementation is optional but required in order to support alignment.
@@ -1424,7 +1424,7 @@ def create_variables(
 
     def isel(
         self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
-    ) -> Self | None:
+    ) -> Index | None:
         # TODO: support returning a new index (e.g., possible to re-calculate the
         # the transform or calculate another transform on a reduced dimension space)
         return None
@@ -1486,7 +1486,9 @@ def sel(
 
         return IndexSelResult(results)
 
-    def equals(self, other: Self) -> bool:
+    def equals(self, other: Index) -> bool:
+        if not isinstance(other, CoordinateTransformIndex):
+            return False
         return self.transform.equals(other.transform)
 
     def rename(

From 2475d4965c8b3f69f70b0acc958fa98ce4a45351 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 24 Feb 2025 17:06:59 +0100
Subject: [PATCH 3/3] Restrict fastpath isel indexes to the case of all
 PandasIndex (#10066)

---
 doc/whats-new.rst            |  2 ++
 xarray/core/indexes.py       |  9 +++++----
 xarray/tests/test_dataset.py | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b98bded5d32..a10a8c8851f 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -55,6 +55,8 @@ Bug fixes
 - Fix DataArray().drop_attrs(deep=False) and add support for attrs to
   DataArray()._replace(). (:issue:`10027`, :pull:`10030`).
   By `Jan Haacker `_.
+- Fix ``isel`` for multi-coordinate Xarray indexes (:issue:`10063`, :pull:`10066`).
+  By `Benoit Bovy `_.
 
 
 Documentation
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index 22febf0b707..37711275bce 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -1991,10 +1991,11 @@ def isel_indexes(
     indexes: Indexes[Index],
     indexers: Mapping[Any, Any],
 ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
-    # TODO: remove if clause in the future. It should be unnecessary.
-    # See failure introduced when removed
-    # https://github.com/pydata/xarray/pull/9002#discussion_r1590443756
-    if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
+    # Fast path function _apply_indexes_fast does not work with multi-coordinate
+    # Xarray indexes (see https://github.com/pydata/xarray/issues/10063).
+    # -> call it only in the most common case where all indexes are default
+    # PandasIndex each associated to a single 1-dimensional coordinate.
+    if any(type(idx) is not PandasIndex for idx in indexes._indexes.values()):
         return _apply_indexes(indexes, indexers, "isel")
     else:
         return _apply_indexes_fast(indexes, indexers, "isel")
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index c3302dd6c9d..7be2d13f9dd 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -1593,6 +1593,40 @@ def test_isel_fancy_convert_index_variable(self) -> None:
         assert "x" not in actual.xindexes
         assert not isinstance(actual.x.variable, IndexVariable)
 
+    def test_isel_multicoord_index(self) -> None:
+        # regression test https://github.com/pydata/xarray/issues/10063
+        # isel on a multi-coordinate index should return a unique index associated
+        # to each coordinate
+        class MultiCoordIndex(xr.Index):
+            def __init__(self, idx1, idx2):
+                self.idx1 = idx1
+                self.idx2 = idx2
+
+            @classmethod
+            def from_variables(cls, variables, *, options=None):
+                idx1 = PandasIndex.from_variables(
+                    {"x": variables["x"]}, options=options
+                )
+                idx2 = PandasIndex.from_variables(
+                    {"y": variables["y"]}, options=options
+                )
+
+                return cls(idx1, idx2)
+
+            def create_variables(self, variables=None):
+                return {**self.idx1.create_variables(), **self.idx2.create_variables()}
+
+            def isel(self, indexers):
+                idx1 = self.idx1.isel({"x": indexers.get("x", slice(None))})
+                idx2 = self.idx2.isel({"y": indexers.get("y", slice(None))})
+                return MultiCoordIndex(idx1, idx2)
+
+        coords = xr.Coordinates(coords={"x": [0, 1], "y": [1, 2]}, indexes={})
+        ds = xr.Dataset(coords=coords).set_xindex(["x", "y"], MultiCoordIndex)
+
+        ds2 = ds.isel(x=slice(None), y=slice(None))
+        assert ds2.xindexes["x"] is ds2.xindexes["y"]
+
     def test_sel(self) -> None:
         data = create_test_data()
         int_slicers = {"dim1": slice(None, None, 2), "dim2": slice(2), "dim3": slice(3)}