From 2e5539d455aa77dfd7f25c3cfa0550343fc96729 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 25 Jan 2024 13:56:48 +0100 Subject: [PATCH 1/6] Test for indexing with lists (#1332) * Test for indexing with lists * release note * Temporarilly comment out test --- anndata/_core/index.py | 17 ++++--------- anndata/tests/helpers.py | 10 ++++++++ anndata/tests/test_hdf5_backing.py | 4 ++-- anndata/tests/test_views.py | 38 +++++++++++++++--------------- docs/release-notes/0.10.5.md | 1 + 5 files changed, 36 insertions(+), 34 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 99489804b..15d840945 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -42,7 +42,7 @@ def _normalize_index( | np.integer | int | str - | Sequence[int | np.integer] + | Sequence[bool | int | np.integer] | np.ndarray | pd.Index, index: pd.Index, @@ -80,18 +80,9 @@ def name_idx(i): indexer = indexer.toarray() indexer = np.ravel(indexer) if not isinstance(indexer, (np.ndarray, pd.Index)): - dtype = "int" - if ( - all(isinstance(x, str) for x in indexer) and len(indexer) > 0 - ): # if not all, but any, then dtype=int will cause an error - dtype = "object" - try: - indexer = np.array(indexer, dtype=dtype) - except ValueError as e: - if str(e).startswith("invalid literal for"): - msg = "Mixed type list indexers not supported." - raise ValueError(msg) from e - raise e + indexer = np.array(indexer) + if len(indexer) == 0: + indexer = indexer.astype(int) if issubclass(indexer.dtype.type, (np.integer, np.floating)): return indexer # Might not work for range indexes elif issubclass(indexer.dtype.type, np.bool_): diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index a62d890d0..a6aabcabf 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -284,6 +284,10 @@ def array_bool_subset(index, min_size=2): return b +def list_bool_subset(index, min_size=2): + return array_bool_subset(index, min_size=min_size).tolist() + + def matrix_bool_subset(index, min_size=2): with warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) @@ -321,6 +325,10 @@ def array_int_subset(index, min_size=2): ) +def list_int_subset(index, min_size=2): + return array_int_subset(index, min_size=min_size).tolist() + + def slice_subset(index, min_size=2): while True: points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False) @@ -340,7 +348,9 @@ def single_subset(index): slice_subset, single_subset, array_int_subset, + list_int_subset, array_bool_subset, + list_bool_subset, matrix_bool_subset, spmatrix_bool_subset, ] diff --git a/anndata/tests/test_hdf5_backing.py b/anndata/tests/test_hdf5_backing.py index 94f2af4b0..f7791de62 100644 --- a/anndata/tests/test_hdf5_backing.py +++ b/anndata/tests/test_hdf5_backing.py @@ -193,8 +193,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray - and isinstance(obs_idx, (np.ndarray, sparse.spmatrix)) - and isinstance(var_idx, (np.ndarray, sparse.spmatrix)) + and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix)) + and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix)) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" diff --git a/anndata/tests/test_views.py b/anndata/tests/test_views.py index 08ec13cdd..dca77265d 100644 --- a/anndata/tests/test_views.py +++ b/anndata/tests/test_views.py @@ -703,22 +703,22 @@ def test_empty_list_subset(): assert 
subset.varm["sparse"].shape == (0, 100) -@pytest.mark.parametrize("dim", ["obs", "var"]) -@pytest.mark.parametrize( - ("idx", "pat"), - [ - pytest.param( - [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed" - ), - pytest.param( - [[1, 2], [2]], r"setting an array element with a sequence", id="nested" - ), - ], -) -def test_subset_errors(dim, idx, pat): - orig = gen_adata((10, 10)) - with pytest.raises(ValueError, match=pat): - if dim == "obs": - orig[idx, :].X - elif dim == "var": - orig[:, idx].X +# @pytest.mark.parametrize("dim", ["obs", "var"]) +# @pytest.mark.parametrize( +# ("idx", "pat"), +# [ +# pytest.param( +# [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed" +# ), +# pytest.param( +# [[1, 2], [2]], r"setting an array element with a sequence", id="nested" +# ), +# ], +# ) +# def test_subset_errors(dim, idx, pat): +# orig = gen_adata((10, 10)) +# with pytest.raises(ValueError, match=pat): +# if dim == "obs": +# orig[idx, :].X +# elif dim == "var": +# orig[:, idx].X diff --git a/docs/release-notes/0.10.5.md b/docs/release-notes/0.10.5.md index edaec3463..97635266b 100644 --- a/docs/release-notes/0.10.5.md +++ b/docs/release-notes/0.10.5.md @@ -6,6 +6,7 @@ * Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup` * Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup` * Fix IO with awkward array version 2.5.2 {pr}`1328` {user}`ivirshup` +* Fix bug (introduced in 0.10.4) where indexing an AnnData with `list[bool]` would return the wrong result {pr}`1332` {user}`ivirshup` ```{rubric} Documentation ``` From 3fcd75552e27204135374041e2c6e851df2fe6f7 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Thu, 25 Jan 2024 17:33:16 +0100 Subject: [PATCH 2/6] (fix): empty boolean mask on backed sparse matrix (#1321) * (fix): check number of slices * (chore): add tests * (chore): release note * (fix): return "empty" sparse matrix with empty bool mask * (feat): add parametrization to bool test * (chore): remove release note * (chore): comment out test * (fix): add empty list test * (fix): typing out! 
* (fix): no more sum --- anndata/_core/sparse_dataset.py | 14 ++++++++++---- anndata/tests/test_backed_sparse.py | 25 ++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index c6bc96515..03514809b 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -180,6 +180,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: idxs = np.asarray(row) + if len(idxs) == 0: + return ss.csr_matrix((0, self.shape[1])) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csr_matrix( @@ -214,6 +216,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: idxs = np.asarray(col) + if len(idxs) == 0: + return ss.csc_matrix((self.shape[0], 0)) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csc_matrix( @@ -290,10 +294,12 @@ def mean_slice_length(slices): return floor((slices[-1].stop - slices[0].start) / len(slices)) # heuristic for whether slicing should be optimized - if mean_slice_length(slices) <= 7: - return get_compressed_vectors(mtx, np.where(mask)[0]) - else: - return get_compressed_vectors_for_slices(mtx, slices) + if len(slices) > 0: + if mean_slice_length(slices) <= 7: + return get_compressed_vectors(mtx, np.where(mask)[0]) + else: + return get_compressed_vectors_for_slices(mtx, slices) + return [], [], [0] def get_format(data: ss.spmatrix) -> str: diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py index 8f3c03d0c..18656bea2 100644 --- a/anndata/tests/test_backed_sparse.py +++ b/anndata/tests/test_backed_sparse.py @@ -27,6 +27,10 @@ def diskfmt(request): return request.param +M = 50 +N = 50 + + @pytest.fixture(scope="function") def ondisk_equivalent_adata( tmp_path: Path, diskfmt: Literal["h5ad", "zarr"] @@ -37,7 +41,7 @@ def ondisk_equivalent_adata( write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) - csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1)) + csr_mem = ad.AnnData(X=sparse.random(M, N, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) dense_mem = ad.AnnData(X=csr_mem.X.toarray()) @@ -77,6 +81,25 @@ def callback(func, elem_name, elem, iospec): return csr_mem, csr_disk, csc_disk, dense_disk +@pytest.mark.parametrize( + "empty_mask", [[], np.zeros(M, dtype=bool)], ids=["empty_list", "empty_bool_mask"] +) +def test_empty_backed_indexing( + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + empty_mask, +): + csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + + assert_equal(csr_mem.X[empty_mask], csr_disk.X[empty_mask]) + assert_equal(csr_mem.X[:, empty_mask], csc_disk.X[:, empty_mask]) + + # The following do not work because of https://github.com/scipy/scipy/issues/19919 + # Our implementation returns a (0,0) sized matrix but scipy does (1,0). 
+ + # assert_equal(csr_mem.X[empty_mask, empty_mask], csr_disk.X[empty_mask, empty_mask]) + # assert_equal(csr_mem.X[empty_mask, empty_mask], csc_disk.X[empty_mask, empty_mask]) + + def test_backed_indexing( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], subset_func, From 6e60815b8769388c7765c37581a2be8e680d4788 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 25 Jan 2024 18:10:06 +0100 Subject: [PATCH 3/6] Set date on release notes (#1337) --- docs/release-notes/0.10.5.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.10.5.md b/docs/release-notes/0.10.5.md index 97635266b..8ffb759a6 100644 --- a/docs/release-notes/0.10.5.md +++ b/docs/release-notes/0.10.5.md @@ -1,4 +1,4 @@ -### 0.10.5 {small}`the future` +### 0.10.5 {small}`2024-01-25` ```{rubric} Bugfix ``` From f539ffc695b9e5aa2b9588764533331f0b47ce6f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 26 Jan 2024 14:07:03 +0100 Subject: [PATCH 4/6] Run CI on release branches (#1339) --- .azure-pipelines.yml | 1 + .github/workflows/benchmark.yml | 2 +- .github/workflows/test-gpu.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 1aa0fc288..68ef58741 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -1,5 +1,6 @@ trigger: - main + - "*.*.x" variables: PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b6d3919d7..a86caf98d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -2,7 +2,7 @@ name: Benchmark on: push: - branches: [main] + branches: [main, "[0-9]+.[0-9]+.x"] pull_request: branches: [main] diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index c32ce2492..104396116 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -2,7 +2,7 @@ name: AWS GPU on: push: - branches: [main] + branches: [main, "[0-9]+.[0-9]+.x"] pull_request: types: - labeled From e07477d54c6642a6bcf3dee887f3fc0eb55321ef Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 26 Jan 2024 14:11:21 +0100 Subject: [PATCH 5/6] Start 0.10.6 (#1341) * Start 0.10.6 * add to release latest --- docs/release-notes/0.10.6.md | 10 ++++++++++ docs/release-notes/release-latest.md | 3 +++ 2 files changed, 13 insertions(+) create mode 100644 docs/release-notes/0.10.6.md diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md new file mode 100644 index 000000000..abdb08924 --- /dev/null +++ b/docs/release-notes/0.10.6.md @@ -0,0 +1,10 @@ +### 0.10.6 {small}`the future` + +```{rubric} Bugfix +``` + +```{rubric} Documentation +``` + +```{rubric} Performance +``` diff --git a/docs/release-notes/release-latest.md b/docs/release-notes/release-latest.md index 9f6682d3f..c36f7a8a5 100644 --- a/docs/release-notes/release-latest.md +++ b/docs/release-notes/release-latest.md @@ -5,6 +5,9 @@ ## Version 0.10 +```{include} /release-notes/0.10.6.md +``` + ```{include} /release-notes/0.10.5.md ``` From 299ca9736c115f630304e4b71c167873aaa45677 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 26 Jan 2024 15:39:20 +0100 Subject: [PATCH 6/6] (fix): move zarr import inside try-catch in test helpers (#1343) * (fix): move zarr import inside try-catch * (chore): release note * Update docs/release-notes/0.10.6.md Co-authored-by: Isaac Virshup * Allow dummy instantiation always --------- Co-authored-by: Isaac Virshup Co-authored-by: Philipp A --- 
anndata/tests/helpers.py | 38 ++++++++++++++++++++++-------------- docs/release-notes/0.10.6.md | 2 ++ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index a6aabcabf..4fb33c039 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -12,7 +12,6 @@ import numpy as np import pandas as pd import pytest -import zarr from pandas.api.types import is_numeric_dtype from scipy import sparse @@ -759,21 +758,30 @@ def shares_memory_sparse(x, y): ), ] +try: + import zarr -class AccessTrackingStore(zarr.DirectoryStore): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._access_count = {} + class AccessTrackingStore(zarr.DirectoryStore): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._access_count = {} - def __getitem__(self, key): - for tracked in self._access_count: - if tracked in key: - self._access_count[tracked] += 1 - return super().__getitem__(key) + def __getitem__(self, key): + for tracked in self._access_count: + if tracked in key: + self._access_count[tracked] += 1 + return super().__getitem__(key) - def get_access_count(self, key): - return self._access_count[key] + def get_access_count(self, key): + return self._access_count[key] - def set_key_trackers(self, keys_to_track): - for k in keys_to_track: - self._access_count[k] = 0 + def set_key_trackers(self, keys_to_track): + for k in keys_to_track: + self._access_count[k] = 0 +except ImportError: + + class AccessTrackingStore: + def __init__(self, *_args, **_kwargs) -> None: + raise ImportError( + "zarr must be imported to create an `AccessTrackingStore` instance." + ) diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md index abdb08924..4a978f4dd 100644 --- a/docs/release-notes/0.10.6.md +++ b/docs/release-notes/0.10.6.md @@ -3,6 +3,8 @@ ```{rubric} Bugfix ``` +* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold` + ```{rubric} Documentation ```
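
The list-indexing change in PATCH 1/6 can be exercised end to end: `_normalize_index` now converts plain Python lists with `np.array(indexer)` and coerces an empty list to integer dtype, so `list[int]` and `list[bool]` indexers behave like their ndarray counterparts. A minimal sketch of that behavior, assuming the patch is applied; the `AnnData` construction below is illustrative and not taken from the test suite:

    import anndata as ad
    import numpy as np
    import pandas as pd
    from scipy import sparse

    # Small AnnData with named observations (illustrative only).
    adata = ad.AnnData(
        X=sparse.random(5, 4, format="csr", density=0.5),
        obs=pd.DataFrame(index=[f"cell_{i}" for i in range(5)]),
    )

    # A plain Python list of bools selects the same rows as the equivalent
    # boolean ndarray -- the list[bool] regression from 0.10.4 fixed here.
    mask_list = [True, False, True, False, False]
    from_list = adata[mask_list].obs_names.tolist()
    from_array = adata[np.array(mask_list)].obs_names.tolist()
    assert from_list == from_array == ["cell_0", "cell_2"]

    # Integer lists behave like integer arrays, and an empty list yields a
    # zero-observation view rather than an error.
    assert adata[[0, 2]].shape == (2, 4)
    assert adata[[]].shape == (0, 4)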
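
Similarly, the backed-sparse change in PATCH 2/6 short-circuits zero-length indexers in `_get_arrayXslice`/`_get_sliceXarray` and returns empty compressed vectors when no slices are produced, so indexing a backed `X` with an empty list or an all-False mask yields an empty matrix instead of failing. A sketch of the round trip covered by the new `test_empty_backed_indexing`, assuming the patch is applied; the file name is illustrative:

    import anndata as ad
    import numpy as np
    from scipy import sparse

    # Write a small CSR AnnData to disk, then reopen it in backed mode.
    adata = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1))
    adata.write_h5ad("demo.h5ad")
    backed = ad.read_h5ad("demo.h5ad", backed="r")

    # An empty list and an all-False boolean mask both return an empty
    # (0 x 50) sparse matrix from the backed X, matching the in-memory result.
    for empty_idx in ([], np.zeros(50, dtype=bool)):
        assert backed.X[empty_idx].shape == (0, 50)
        assert adata.X[empty_idx].shape == (0, 50)

The 2-D empty-mask case remains commented out in the new test because scipy returns a (1, 0) matrix where this implementation returns (0, 0), per the scipy issue referenced in the test comment.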
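
Finally, PATCH 6/6 defines the real `AccessTrackingStore` only when `import zarr` succeeds, falling back to a stub that raises `ImportError` on instantiation. With zarr 2.x installed (where `zarr.DirectoryStore` exists) and `anndata.tests.helpers` importable from the installed package, the store can count reads of selected keys; the group contents and paths below are illustrative, not taken from the patch:

    import numpy as np
    import zarr  # the tracking store is only defined when zarr imports

    from anndata.tests.helpers import AccessTrackingStore

    # Create a small zarr group on disk, then reopen it through the tracking
    # store and count reads of keys containing "x".
    zarr.save_group("demo.zarr", x=np.arange(10), y=np.ones(5))

    store = AccessTrackingStore("demo.zarr")
    store.set_key_trackers(["x"])

    g = zarr.open_group(store, mode="r")
    _ = g["x"][:]  # metadata and chunk reads for "x" bump the counter

    print(store.get_access_count("x"))  # > 0 once "x" has been read

Without zarr installed, importing `anndata.tests.helpers` still succeeds; only instantiating `AccessTrackingStore` raises, which is what lets scanpy's CI use the helpers without the optional dependency.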