diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml index 7e875a4b8..81768c776 100644 --- a/.azure-pipelines.yml +++ b/.azure-pipelines.yml @@ -1,11 +1,13 @@ trigger: - main + - "*.*.x" variables: PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip RUN_COVERAGE: no PYTEST_ADDOPTS: --color=yes --junitxml=test-data/test-results.xml - PRERELEASE_DEPENDENCIES: no + DEPENDENCIES_VERSION: "latest" # |"pre-release" | "minimum-version" + TEST_TYPE: "standard" # | "coverage" jobs: - job: PyTest @@ -13,14 +15,20 @@ jobs: vmImage: "ubuntu-22.04" strategy: matrix: - Python3.11: - python.version: "3.11" + Python3.12: + python.version: "3.12" RUN_COVERAGE: yes + TEST_TYPE: "coverage" Python3.9: python.version: "3.9" PreRelease: - python.version: "3.11" - PRERELEASE_DEPENDENCIES: yes + python.version: "3.12" + DEPENDENCIES_VERSION: "pre-release" + TEST_TYPE: "strict-warning" + minimum_versions: + python.version: "3.9" + DEPENDENCIES_VERSION: "minimum" + TEST_TYPE: "coverage" steps: - task: UsePythonVersion@0 inputs: @@ -40,13 +48,20 @@ jobs: python -m pip install --upgrade pip wheel pip install .[dev,test] displayName: "Install dependencies" - condition: eq(variables['PRERELEASE_DEPENDENCIES'], 'no') + condition: eq(variables['DEPENDENCIES_VERSION'], 'latest') + + - script: | + python -m pip install pip wheel tomli packaging pytest-cov + pip install `python3 ci/scripts/min-deps.py pyproject.toml --extra dev test` + pip install --no-deps . + displayName: "Install minimum dependencies" + condition: eq(variables['DEPENDENCIES_VERSION'], 'minimum') - script: | python -m pip install --pre --upgrade pip wheel pip install --pre .[dev,test] displayName: "Install dependencies release candidates" - condition: eq(variables['PRERELEASE_DEPENDENCIES'], 'yes') + condition: eq(variables['DEPENDENCIES_VERSION'], 'pre-release') - script: | pip list @@ -55,18 +70,23 @@ jobs: - script: | pytest displayName: "PyTest" - condition: eq(variables['RUN_COVERAGE'], 'no') + condition: eq(variables['TEST_TYPE'], 'standard') - script: | pytest --cov --cov-report=xml --cov-context=test displayName: "PyTest (coverage)" - condition: eq(variables['RUN_COVERAGE'], 'yes') + condition: eq(variables['TEST_TYPE'], 'coverage') + + - script: | + pytest --strict-warnings + displayName: "PyTest (treat warnings as errors)" + condition: eq(variables['TEST_TYPE'], 'strict-warning') - task: PublishCodeCoverageResults@1 inputs: codeCoverageTool: Cobertura summaryFileLocation: "test-data/coverage.xml" - condition: eq(variables['RUN_COVERAGE'], 'yes') + condition: eq(variables['TEST_TYPE'], 'coverage') - task: PublishTestResults@2 condition: succeededOrFailed() @@ -77,7 +97,7 @@ jobs: - script: bash <(curl -s https://codecov.io/bash) displayName: "Upload to codecov.io" - condition: eq(variables['RUN_COVERAGE'], 'yes') + condition: eq(variables['TEST_TYPE'], 'coverage') - job: CheckBuild pool: @@ -85,8 +105,8 @@ jobs: steps: - task: UsePythonVersion@0 inputs: - versionSpec: "3.11" - displayName: "Use Python 3.11" + versionSpec: "3.12" + displayName: "Use Python 3.12" - script: | python -m pip install --upgrade pip diff --git a/.codecov.yml b/.codecov.yml index 9dd8f244a..68cc92f2d 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -8,7 +8,6 @@ coverage: default: # Require 1% coverage, i.e., always succeed target: 1 - patch: false changes: false comment: diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 6cb0b86c9..e28d0778a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ 
b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -56,6 +56,6 @@ body: ```python >>> import anndata, session_info; session_info.show(html=False, dependencies=True) ``` - render: python + render: python validations: required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 5b62547f9..e524628a5 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,5 +1,8 @@ -blank_issues_enabled: false +blank_issues_enabled: true contact_links: - name: Scverse Community Forum url: https://discourse.scverse.org/ about: If you have questions about “How to do X”, please ask them here. + - name: Blank issue + url: https://github.com/scverse/anndata/issues/new + about: For things that don't quite fit elsewhere. Please note that other templates should be used in most cases – this is mainly for use by the developers. diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index b6d3919d7..f46beb094 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -2,7 +2,7 @@ name: Benchmark on: push: - branches: [main] + branches: [main, "[0-9]+.[0-9]+.x"] pull_request: branches: [main] @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python: ["3.11"] + python: ["3.12"] os: [ubuntu-latest] env: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 000000000..c67973007 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,21 @@ +name: Publish Python Package + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write # to authenticate as Trusted Publisher to pypi.org + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: "3.x" + cache: "pip" + - run: pip install build + - run: python -m build + - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index c32ce2492..8b7099f76 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -2,7 +2,7 @@ name: AWS GPU on: push: - branches: [main] + branches: [main, "[0-9]+.[0-9]+.x"] pull_request: types: - labeled @@ -35,9 +35,14 @@ jobs: name: GPU Tests needs: check runs-on: "cirun-aws-gpu--${{ github.run_id }}" + # Setting a timeout of 30 minutes, as the AWS costs money + # At time of writing, a typical run takes about 5 minutes + timeout-minutes: 30 + defaults: run: shell: bash -el {0} + steps: - uses: actions/checkout@v3 with: @@ -49,14 +54,7 @@ jobs: - uses: mamba-org/setup-micromamba@v1 with: micromamba-version: "1.3.1-0" - environment-name: anndata-gpu-ci - create-args: >- - python=3.11 - cupy - numba - pytest - pytest-cov - pytest-xdist + environment-file: ci/gpu_ci.yml init-shell: >- bash generate-run-shell: false @@ -64,8 +62,10 @@ jobs: - name: Install AnnData run: pip install .[dev,test,gpu] - - name: Mamba list - run: micromamba list + - name: Env list + run: | + micromamba list + pip list - name: Run test run: pytest -m gpu --cov --cov-report=xml --cov-context=test -n 4 diff --git a/.gitignore b/.gitignore index dded609a6..88f0e90c3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ # Caches for compiled and downloaded files __pycache__/ /*cache/ +/node_modules/ /data/ # Distribution / packaging diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e903f7205..064bdc00f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,20 
+1,20 @@ repos: - - repo: https://github.com/psf/black - rev: 23.9.1 - hooks: - - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: "v0.0.292" + rev: v0.2.2 hooks: - id: ruff + types_or: [python, pyi, jupyter] args: ["--fix"] + - id: ruff-format + types_or: [python, pyi, jupyter] - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.0.3 + rev: v4.0.0-alpha.8 hooks: - id: prettier + exclude_types: + - markdown - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -26,7 +26,6 @@ repos: - id: detect-private-key - id: no-commit-to-branch args: ["--branch=main"] - - repo: https://github.com/codespell-project/codespell rev: v2.2.6 hooks: diff --git a/.readthedocs.yml b/.readthedocs.yml index ec7305492..764eb57bd 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -2,7 +2,7 @@ version: 2 build: os: ubuntu-20.04 tools: - python: "3.11" + python: "3.12" sphinx: configuration: docs/conf.py fail_on_warning: true # do not change or you will be fired diff --git a/anndata/__init__.py b/anndata/__init__.py index 6dd2e2192..6cae971ac 100644 --- a/anndata/__init__.py +++ b/anndata/__init__.py @@ -34,6 +34,7 @@ read_umi_tools, read_zarr, ) +from ._settings import settings from ._warnings import ( ExperimentalFeatureWarning, ImplicitModificationWarning, @@ -75,4 +76,5 @@ def read(*args, **kwargs): "ImplicitModificationWarning", "ExperimentalFeatureWarning", "experimental", + "settings", ] diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index a69730e26..f57dfb272 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -8,6 +8,7 @@ from typing import ( TYPE_CHECKING, ClassVar, + Literal, TypeVar, Union, ) @@ -19,7 +20,7 @@ from anndata._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from anndata.compat import AwkArray -from ..utils import deprecated, dim_len, ensure_df_homogeneous +from ..utils import deprecated, dim_len, ensure_df_homogeneous, warn_once from .access import ElementRef from .index import _subset from .views import as_view, view_update @@ -61,35 +62,31 @@ def _ipython_key_completions_(self) -> list[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" if isinstance(val, AwkArray): - warnings.warn( + warn_once( "Support for Awkward Arrays is currently experimental. " "Behavior may change in the future. Please report any issues you may encounter!", ExperimentalFeatureWarning, # stacklevel=3, ) - # Prevent from showing up every time an awkward array is used - # You'd think `once` works, but it doesn't at the repl and in notebooks - warnings.filterwarnings( - "ignore", - category=ExperimentalFeatureWarning, - message="Support for Awkward Arrays is currently experimental.*", - ) for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != dim_len(val, i): - right_shape = tuple(self.parent.shape[a] for a in self.axes) - actual_shape = tuple(dim_len(val, a) for a, _ in enumerate(self.axes)) - if actual_shape[i] is None and isinstance(val, AwkArray): - raise ValueError( - f"The AwkwardArray is of variable length in dimension {i}.", - f"Try ak.to_regular(array, {i}) before including the array in AnnData", - ) - else: - raise ValueError( - f"Value passed for key {key!r} is of incorrect shape. " - f"Values of {self.attrname} must match dimensions " - f"{self.axes} of parent. 
Value had shape {actual_shape} while " - f"it should have had {right_shape}." - ) + if self.parent.shape[axis] == dim_len(val, i): + continue + right_shape = tuple(self.parent.shape[a] for a in self.axes) + actual_shape = tuple(dim_len(val, a) for a, _ in enumerate(self.axes)) + if actual_shape[i] is None and isinstance(val, AwkArray): + dim = ("obs", "var")[i] + msg = ( + f"The AwkwardArray is of variable length in dimension {dim}.", + f"Try ak.to_regular(array, {i}) before including the array in AnnData", + ) + else: + dims = tuple(("obs", "var")[ax] for ax in self.axes) + msg = ( + f"Value passed for key {key!r} is of incorrect shape. " + f"Values of {self.attrname} must match dimensions {dims} of parent. " + f"Value had shape {actual_shape} while it should have had {right_shape}." + ) + raise ValueError(msg) if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") @@ -104,7 +101,7 @@ def attrname(self) -> str: @property @abstractmethod - def axes(self) -> tuple[int, ...]: + def axes(self) -> tuple[Literal[0, 1], ...]: """Which axes of the parent is this aligned to?""" pass @@ -131,7 +128,7 @@ def _view(self, parent: AnnData, subset_idx: I): """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) - @deprecated("dict(obj)") + @deprecated("dict(obj)", FutureWarning) def as_dict(self) -> dict: return dict(self) @@ -166,7 +163,10 @@ def __setitem__(self, key: str, value: V): new_mapping[key] = value def __delitem__(self, key: str): - _ = key in self # Make sure it exists before bothering with a copy + if key not in self: + raise KeyError( + "'{key!r}' not found in view of {self.attrname}" + ) # Make sure it exists before bothering with a copy warnings.warn( f"Removing element `.{self.attrname}['{key}']` of view, " "initializing view as actual.", @@ -226,7 +226,7 @@ def attrname(self) -> str: return f"{self.dim}m" @property - def axes(self) -> tuple[int]: + def axes(self) -> tuple[Literal[0, 1]]: """Axes of the parent this is aligned to""" return (self._axis,) @@ -260,7 +260,7 @@ def _validate_value(self, val: V, key: str) -> V: try: pd.testing.assert_index_equal(val.index, self.dim_names) except AssertionError as e: - msg = f"value.index does not match parent’s axis {self.axes[0]} names:\n{e}" + msg = f"value.index does not match parent’s {self.dim} names:\n{e}" raise ValueError(msg) from None else: msg = "Index.equals and pd.testing.assert_index_equal disagree" @@ -361,7 +361,7 @@ def attrname(self) -> str: return f"{self.dim}p" @property - def axes(self) -> tuple[int, int]: + def axes(self) -> tuple[Literal[0], Literal[0]] | tuple[Literal[1], Literal[1]]: """Axes of the parent this is aligned to""" return self._axis, self._axis diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 944fc66a4..7d4f2e573 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -25,11 +25,12 @@ from numpy import ma from pandas.api.types import infer_dtype, is_string_dtype from scipy import sparse -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import issparse from anndata._warnings import ImplicitModificationWarning from .. 
import utils +from .._settings import settings from ..compat import ( CupyArray, CupySparseMatrix, @@ -39,7 +40,7 @@ _move_adj_mtx, ) from ..logging import anndata_logger as logger -from ..utils import convert_to_dict, dim_len, ensure_df_homogeneous +from ..utils import convert_to_dict, deprecated, dim_len, ensure_df_homogeneous from .access import ElementRef from .aligned_mapping import ( AxisArrays, @@ -74,7 +75,7 @@ class StorageType(Enum): DaskArray = DaskArray CupyArray = CupyArray CupySparseMatrix = CupySparseMatrix - BackedSparseMAtrix = BaseCompressedSparseDataset + BackedSparseMatrix = BaseCompressedSparseDataset @classmethod def classes(cls): @@ -413,8 +414,9 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): self._varp = adata_ref.varp._view(self, vidx) # fix categories uns = copy(adata_ref._uns) - self._remove_unused_categories(adata_ref.obs, obs_sub, uns) - self._remove_unused_categories(adata_ref.var, var_sub, uns) + if settings.remove_unused_categories: + self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + self._remove_unused_categories(adata_ref.var, var_sub, uns) # set attributes self._obs = DataFrameView(obs_sub, view_args=(self, "obs")) self._var = DataFrameView(var_sub, view_args=(self, "var")) @@ -592,28 +594,37 @@ def _init_as_actual( # layers self._layers = Layers(self, layers) - def __sizeof__(self, show_stratified=None) -> int: - def get_size(X): - if issparse(X): - X_csr = csr_matrix(X) - return X_csr.data.nbytes + X_csr.indptr.nbytes + X_csr.indices.nbytes + def __sizeof__(self, show_stratified=None, with_disk: bool = False) -> int: + def get_size(X) -> int: + def cs_to_bytes(X) -> int: + return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes) + + if isinstance(X, h5py.Dataset) and with_disk: + return int(np.array(X.shape).prod() * X.dtype.itemsize) + elif isinstance(X, BaseCompressedSparseDataset) and with_disk: + return cs_to_bytes(X._to_backed()) + elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)): + return cs_to_bytes(X) else: return X.__sizeof__() - size = 0 - attrs = list(["_X", "_obs", "_var"]) - attrs_multi = list(["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]) + sizes = {} + attrs = ["X", "_obs", "_var"] + attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"] for attr in attrs + attrs_multi: if attr in attrs_multi: keys = getattr(self, attr).keys() - s = sum([get_size(getattr(self, attr)[k]) for k in keys]) + s = sum(get_size(getattr(self, attr)[k]) for k in keys) else: s = get_size(getattr(self, attr)) if s > 0 and show_stratified: - str_attr = attr.replace("_", ".") + " " * (7 - len(attr)) - print(f"Size of {str_attr}: {'%3.2f' % (s / (1024 ** 2))} MB") - size += s - return size + from tqdm import tqdm + + print( + f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}" + ) + sizes[attr] = s + return sum(sizes.values()) def _gen_repr(self, n_obs, n_vars) -> str: if self.isbacked: @@ -875,23 +886,21 @@ def _prep_dim_index(self, value, attr: str) -> pd.Index: value = pd.Index(value) if not isinstance(value.name, (str, type(None))): value.name = None - # fmt: off if ( - not isinstance(value, pd.RangeIndex) + len(value) > 0 + and not isinstance(value, pd.RangeIndex) and infer_dtype(value) not in ("string", "bytes") ): sample = list(value[: min(len(value), 5)]) - warnings.warn(dedent( + msg = dedent( f""" AnnData expects .{attr}.index to contain strings, but got values like: {sample} Inferred to be: {infer_dtype(value)} """ - ), # noqa - stacklevel=2, ) - # fmt: on + 
warnings.warn(msg, stacklevel=2) return value def _set_dim_index(self, value: pd.Index, attr: str): @@ -1303,6 +1312,7 @@ def _inplace_subset_var(self, index: Index1D): Same as `adata = adata[:, index]`, but inplace. """ adata_subset = self[:, index].copy() + self._init_as_actual(adata_subset) def _inplace_subset_obs(self, index: Index1D): @@ -1312,6 +1322,7 @@ def _inplace_subset_obs(self, index: Index1D): Same as `adata = adata[index, :]`, but inplace. """ adata_subset = self[index].copy() + self._init_as_actual(adata_subset) # TODO: Update, possibly remove @@ -1597,6 +1608,13 @@ def copy(self, filename: PathLike | None = None) -> AnnData: write_h5ad(filename, self) return read_h5ad(filename, backed=mode) + @deprecated( + "anndata.concat", + FutureWarning, + "See the tutorial for concat at: " + "https://anndata.readthedocs.io/en/latest/concatenation.html", + hide=False, + ) def concatenate( self, *adatas: AnnData, @@ -1820,14 +1838,6 @@ def concatenate( """ from .merge import concat, merge_dataframes, merge_outer, merge_same - warnings.warn( - "The AnnData.concatenate method is deprecated in favour of the " - "anndata.concat function. Please use anndata.concat instead.\n\n" - "See the tutorial for concat at: " - "https://anndata.readthedocs.io/en/latest/concatenation.html", - FutureWarning, - ) - if self.isbacked: raise ValueError("Currently, concatenate only works in memory mode.") diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 5d76fc5ea..bdf1bfe27 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -42,11 +42,11 @@ def _normalize_index( | np.integer | int | str - | Sequence[int | np.integer] + | Sequence[bool | int | np.integer] | np.ndarray | pd.Index, index: pd.Index, -) -> slice | int | np.ndarray: # ndarray of int +) -> slice | int | np.ndarray: # ndarray of int or bool if not isinstance(index, pd.RangeIndex): assert ( index.dtype != float and index.dtype != int @@ -81,6 +81,8 @@ def name_idx(i): indexer = np.ravel(indexer) if not isinstance(indexer, (np.ndarray, pd.Index)): indexer = np.array(indexer) + if len(indexer) == 0: + indexer = indexer.astype(int) if issubclass(indexer.dtype.type, (np.integer, np.floating)): return indexer # Might not work for range indexes elif issubclass(indexer.dtype.type, np.bool_): @@ -90,8 +92,7 @@ def name_idx(i): f"dimension. Boolean index has shape {indexer.shape} while " f"AnnData index has shape {index.shape}." 
) - positions = np.where(indexer)[0] - return positions # np.ndarray[int] + return indexer else: # indexer should be string array positions = index.get_indexer(indexer) if np.any(positions < 0): @@ -162,7 +163,10 @@ def _subset_dask(a: DaskArray, subset_idx: Index): def _subset_spmatrix(a: spmatrix, subset_idx: Index): # Correcting for indexing behaviour of sparse.spmatrix if len(subset_idx) > 1 and all(isinstance(x, cabc.Iterable) for x in subset_idx): - subset_idx = (subset_idx[0].reshape(-1, 1), *subset_idx[1:]) + first_idx = subset_idx[0] + if issubclass(first_idx.dtype.type, np.bool_): + first_idx = np.where(first_idx)[0] + subset_idx = (first_idx.reshape(-1, 1), *subset_idx[1:]) return a[subset_idx] @@ -186,7 +190,9 @@ def _subset_dataset(d, subset_idx): ordered = list(subset_idx) rev_order = [slice(None) for _ in range(len(subset_idx))] for axis, axis_idx in enumerate(ordered.copy()): - if isinstance(axis_idx, np.ndarray) and axis_idx.dtype.type != bool: + if isinstance(axis_idx, np.ndarray): + if axis_idx.dtype == bool: + axis_idx = np.where(axis_idx)[0] order = np.argsort(axis_idx) ordered[axis] = axis_idx[order] rev_order[axis] = np.argsort(order) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 3bb9970d5..48f36be9d 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -13,11 +13,11 @@ MutableSet, Sequence, ) -from functools import reduce, singledispatch +from functools import partial, reduce, singledispatch from itertools import repeat from operator import and_, or_, sub from typing import Any, Literal, TypeVar -from warnings import filterwarnings, warn +from warnings import warn import numpy as np import pandas as pd @@ -27,8 +27,15 @@ from anndata._warnings import ExperimentalFeatureWarning -from ..compat import AwkArray, CupyArray, CupyCSRMatrix, CupySparseMatrix, DaskArray -from ..utils import asarray, dim_len +from ..compat import ( + AwkArray, + CupyArray, + CupyCSRMatrix, + CupySparseMatrix, + DaskArray, + _map_cat_to_str, +) +from ..utils import asarray, dim_len, warn_once from .anndata import AnnData from .index import _subset, make_slice @@ -134,7 +141,13 @@ def equal_dask_array(a, b) -> bool: @equal.register(np.ndarray) def equal_array(a, b) -> bool: - return equal(pd.DataFrame(a), pd.DataFrame(asarray(b))) + # Reshaping allows us to compare inputs with >2 dimensions + # We cast to pandas since it will still work with non-numeric types + b = asarray(b) + if a.shape != b.shape: + return False + + return equal(pd.DataFrame(a.reshape(-1)), pd.DataFrame(b.reshape(-1))) @equal.register(CupyArray) @@ -212,6 +225,7 @@ def unify_dtypes(dfs: Iterable[pd.DataFrame]) -> list[pd.DataFrame]: For catching cases where pandas would convert to object dtype. 
""" + dfs = list(dfs) # Get shared categorical columns df_dtypes = [dict(df.dtypes) for df in dfs] columns = reduce(lambda x, y: x.union(y), [df.columns for df in dfs]) @@ -264,7 +278,7 @@ def try_unifying_dtype( dtypes.add(dtype) ordered = ordered | dtype.ordered elif not pd.isnull(dtype): - return False + return None if len(dtypes) > 0 and not ordered: categories = reduce( lambda x, y: x.union(y), @@ -745,9 +759,9 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) # TODO: behaviour here should be chosen through a merge strategy df = pd.concat( - unify_dtypes([f(x) for f, x in zip(reindexers, arrays)]), - ignore_index=True, + unify_dtypes(f(x) for f, x in zip(reindexers, arrays)), axis=axis, + ignore_index=True, ) df.index = index return df @@ -812,7 +826,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) -def inner_concat_aligned_mapping(mappings, reindexers=None, index=None, axis=0): +def inner_concat_aligned_mapping(mappings, *, reindexers=None, index=None, axis=0): result = {} for k in intersect_keys(mappings): @@ -871,17 +885,12 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) - warn( - "Outer joins on awkward.Arrays will have different return values in the future." + warn_once( + "Outer joins on awkward.Arrays will have different return values in the future. " "For details, and to offer input, please see:\n\n\t" "https://github.com/scverse/anndata/issues/898", ExperimentalFeatureWarning, ) - filterwarnings( - "ignore", - category=ExperimentalFeatureWarning, - message=r"Outer joins on awkward.Arrays will have different return values.*", - ) # all_keys = union_keys(el.fields for el in els if not_missing(el)) reindexers = [] for el in els: @@ -905,11 +914,18 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): def outer_concat_aligned_mapping( - mappings, reindexers=None, index=None, fill_value=None, axis=0 + mappings, *, reindexers=None, index=None, axis=0, fill_value=None ): result = {} ns = [m.parent.shape[axis] for m in mappings] + def missing_element(n: int, axis: Literal[0, 1] = 0) -> np.ndarray: + """Generates value to use when there is a missing element.""" + if axis == 0: + return np.zeros((n, 0), dtype=bool) + else: + return np.zeros((0, n), dtype=bool) + for k in union_keys(mappings): els = [m.get(k, MissingVal) for m in mappings] if reindexers is None: @@ -921,7 +937,7 @@ def outer_concat_aligned_mapping( # We should probably just handle missing elements for all types result[k] = concat_arrays( [ - el if not_missing(el) else np.zeros((n, 0), dtype=bool) + el if not_missing(el) else missing_element(n, axis=axis) for el, n in zip(els, ns) ], cur_reindexers, @@ -1105,12 +1121,18 @@ def concat( ... X=sparse.csr_matrix(np.array([[0, 1], [2, 3]])), ... obs=pd.DataFrame({"group": ["a", "b"]}, index=["s1", "s2"]), ... var=pd.DataFrame(index=["var1", "var2"]), - ... varm={"ones": np.ones((2, 5)), "rand": np.random.randn(2, 3), "zeros": np.zeros((2, 5))}, + ... varm={ + ... "ones": np.ones((2, 5)), + ... "rand": np.random.randn(2, 3), + ... "zeros": np.zeros((2, 5)), + ... }, ... uns={"a": 1, "b": 2, "c": {"c.a": 3, "c.b": 4}}, ... ) >>> b = ad.AnnData( ... X=sparse.csr_matrix(np.array([[4, 5, 6], [7, 8, 9]])), - ... obs=pd.DataFrame({"group": ["b", "c"], "measure": [1.2, 4.3]}, index=["s3", "s4"]), + ... obs=pd.DataFrame( + ... 
{"group": ["b", "c"], "measure": [1.2, 4.3]}, index=["s3", "s4"] + ... ), ... var=pd.DataFrame(index=["var1", "var2", "var3"]), ... varm={"ones": np.ones((3, 5)), "rand": np.random.randn(3, 5)}, ... uns={"a": 1, "b": 3, "c": {"c.b": 4}}, @@ -1144,7 +1166,7 @@ def concat( >>> (inner.obs_names, inner.var_names) # doctest: +NORMALIZE_WHITESPACE (Index(['s1', 's2', 's3', 's4'], dtype='object'), Index(['var1', 'var2'], dtype='object')) - >>> outer = ad.concat([a, b], join="outer") # Joining on union of variables + >>> outer = ad.concat([a, b], join="outer") # Joining on union of variables >>> outer AnnData object with n_obs × n_vars = 4 × 3 obs: 'group', 'measure' @@ -1239,7 +1261,9 @@ def concat( [pd.Series(dim_indices(a, axis=axis)) for a in adatas], ignore_index=True ) if index_unique is not None: - concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique) + concat_indices = concat_indices.str.cat( + _map_cat_to_str(label_col), sep=index_unique + ) concat_indices = pd.Index(concat_indices) alt_indices = merge_indices( @@ -1252,7 +1276,7 @@ def concat( # Annotation for concatenation axis check_combinable_cols([getattr(a, dim).columns for a in adatas], join=join) concat_annot = pd.concat( - unify_dtypes([getattr(a, dim) for a in adatas]), + unify_dtypes(getattr(a, dim) for a in adatas), join=join, ignore_index=True, ) @@ -1268,37 +1292,30 @@ def concat( X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value) if join == "inner": - layers = inner_concat_aligned_mapping( - [a.layers for a in adatas], axis=axis, reindexers=reindexers - ) - concat_mapping = inner_concat_aligned_mapping( - [getattr(a, f"{dim}m") for a in adatas], index=concat_indices - ) - if pairwise: - concat_pairwise = concat_pairwise_mapping( - mappings=[getattr(a, f"{dim}p") for a in adatas], - shapes=[a.shape[axis] for a in adatas], - join_keys=intersect_keys, - ) - else: - concat_pairwise = {} + concat_aligned_mapping = inner_concat_aligned_mapping + join_keys = intersect_keys elif join == "outer": - layers = outer_concat_aligned_mapping( - [a.layers for a in adatas], reindexers, axis=axis, fill_value=fill_value + concat_aligned_mapping = partial( + outer_concat_aligned_mapping, fill_value=fill_value ) - concat_mapping = outer_concat_aligned_mapping( - [getattr(a, f"{dim}m") for a in adatas], - index=concat_indices, - fill_value=fill_value, + join_keys = union_keys + else: + assert False, f"{join=} should have been validated above by pd.concat" + + layers = concat_aligned_mapping( + [a.layers for a in adatas], axis=axis, reindexers=reindexers + ) + concat_mapping = concat_aligned_mapping( + [getattr(a, f"{dim}m") for a in adatas], index=concat_indices + ) + if pairwise: + concat_pairwise = concat_pairwise_mapping( + mappings=[getattr(a, f"{dim}p") for a in adatas], + shapes=[a.shape[axis] for a in adatas], + join_keys=join_keys, ) - if pairwise: - concat_pairwise = concat_pairwise_mapping( - mappings=[getattr(a, f"{dim}p") for a in adatas], - shapes=[a.shape[axis] for a in adatas], - join_keys=union_keys, - ) - else: - concat_pairwise = {} + else: + concat_pairwise = {} # TODO: Reindex lazily, so we don't have to make those copies until we're sure we need the element alt_mapping = merge( diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 3041c5dee..6dfb89745 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -15,7 +15,10 @@ import collections.abc as cabc import warnings from abc import ABC +from functools import 
cached_property from itertools import accumulate, chain +from math import floor +from pathlib import Path from typing import TYPE_CHECKING, Literal, NamedTuple import h5py @@ -24,7 +27,7 @@ from scipy.sparse import _sparsetools from anndata._core.index import _fix_slice_bounds -from anndata.compat import H5Group, ZarrGroup +from anndata.compat import H5Group, ZarrArray, ZarrGroup from ..compat import _read_attr @@ -39,6 +42,8 @@ if TYPE_CHECKING: from collections.abc import Iterable, Sequence + from .._types import GroupStorageType + class BackedFormat(NamedTuple): format: str @@ -57,8 +62,17 @@ class BackedSparseMatrix(_cs_matrix): def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): return sparse_dataset(self.data.parent).to_memory() - else: - return super().copy() + if isinstance(self.data, ZarrArray): + import zarr + + return sparse_dataset( + zarr.open( + store=self.data.store, + mode="r", + chunk_store=self.data.chunk_store, # chunk_store is needed, not clear why + )[Path(self.data.path).parent] + ).to_memory() + return super().copy() def _set_many(self, i: Iterable[int], j: Iterable[int], x): """\ @@ -127,7 +141,7 @@ def _offsets( def _get_contiguous_compressed_slice( self, s: slice ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: - new_indptr = self.indptr[s.start : s.stop + 1] + new_indptr = self.indptr[s.start : s.stop + 1].copy() start = new_indptr[0] stop = new_indptr[-1] @@ -154,7 +168,6 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: slice_len(row, self.shape[0]), slice_len(col, self.shape[1]), ) - if out_shape[0] == 1: return self._get_intXslice(slice_as_int(row, self.shape[0]), col) elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: @@ -167,6 +180,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: idxs = np.asarray(row) + if len(idxs) == 0: + return ss.csr_matrix((0, self.shape[1])) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csr_matrix( @@ -201,6 +216,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: idxs = np.asarray(col) + if len(idxs) == 0: + return ss.csc_matrix((self.shape[0], 0)) if idxs.dtype == bool: idxs = np.where(idxs) return ss.csc_matrix( @@ -236,6 +253,28 @@ def get_compressed_vectors( return data, indices, indptr +def get_compressed_vectors_for_slices( + x: BackedSparseMatrix, slices: Iterable[slice] +) -> tuple[Sequence, Sequence, Sequence]: + indptr_sels = [x.indptr[slice(s.start, s.stop + 1)] for s in slices] + data = np.concatenate([x.data[s[0] : s[-1]] for s in indptr_sels]) + indices = np.concatenate([x.indices[s[0] : s[-1]] for s in indptr_sels]) + # Need to track the size of the gaps in the slices to each indptr subselection + total = indptr_sels[0][0] + offsets = [total] + for i, sel in enumerate(indptr_sels[1:]): + total = (sel[0] - indptr_sels[i][-1]) + total + offsets.append(total) + start_indptr = indptr_sels[0] - offsets[0] + if len(slices) < 2: # there is only one slice so no need to concatenate + return data, indices, start_indptr + end_indptr = np.concatenate( + [s[1:] - offsets[i + 1] for i, s in enumerate(indptr_sels[1:])] + ) + indptr = np.concatenate([start_indptr, end_indptr]) + return data, indices, indptr + + def get_compressed_vector( x: BackedSparseMatrix, idx: int ) -> tuple[Sequence, Sequence, Sequence]: @@ -246,6 +285,23 @@ def get_compressed_vector( return data, 
indices, indptr +def subset_by_major_axis_mask( + mtx: ss.spmatrix, mask: np.ndarray +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + slices = np.ma.extras._ezclump(mask) + + def mean_slice_length(slices): + return floor(sum(s.stop - s.start for s in slices) / len(slices)) + + # heuristic for whether slicing should be optimized + if len(slices) > 0: + if mean_slice_length(slices) <= 7: + return get_compressed_vectors(mtx, np.where(mask)[0]) + else: + return get_compressed_vectors_for_slices(mtx, slices) + return [], [], [0] + + def get_format(data: ss.spmatrix) -> str: for fmt, _, memory_class in FORMATS: if isinstance(data, memory_class): @@ -280,13 +336,26 @@ def _get_group_format(group) -> str: class BaseCompressedSparseDataset(ABC): """Analogous to :class:`h5py.Dataset ` or `zarr.Array`, but for sparse matrices.""" - def __init__(self, group: h5py.Group | ZarrGroup): + _group: GroupStorageType + + def __init__(self, group: GroupStorageType): type(self)._check_group_format(group) - self.group = group + self._group = group shape: tuple[int, int] """Shape of the matrix.""" + @property + def group(self): + """The group underlying the backed matrix.""" + return self._group + + @group.setter + def group(self, val): + raise AttributeError( + f"Do not reset group on a {type(self)} with {val}. Instead use `sparse_dataset` to make a new class." + ) + @property def backend(self) -> Literal["zarr", "hdf5"]: if isinstance(self.group, ZarrGroup): @@ -341,9 +410,22 @@ def __repr__(self) -> str: return f"{type(self).__name__}: backend {self.backend}, shape {self.shape}, data_dtype {self.dtype}" def __getitem__(self, index: Index | tuple[()]) -> float | ss.spmatrix: - row, col = self._normalize_index(index) + indices = self._normalize_index(index) + row, col = indices mtx = self._to_backed() - sub = mtx[row, col] + + # Handle masked indexing along major axis + if self.format == "csr" and np.array(row).dtype == bool: + sub = ss.csr_matrix( + subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1]) + )[:, col] + elif self.format == "csc" and np.array(col).dtype == bool: + sub = ss.csc_matrix( + subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum()) + )[row, :] + else: + sub = mtx[row, col] + # If indexing is array x array it returns a backed_sparse_matrix # Not sure what the performance is on that operation if isinstance(sub, BackedSparseMatrix): @@ -354,7 +436,7 @@ def __getitem__(self, index: Index | tuple[()]) -> float | ss.spmatrix: def _normalize_index( self, index: Index | tuple[()] ) -> tuple[np.ndarray, np.ndarray]: - if index == (): + if isinstance(index, tuple) and not len(index): index = slice(None) row, col = unpack_index(index) if all(isinstance(x, cabc.Iterable) for x in (row, col)): @@ -431,12 +513,17 @@ def append(self, sparse_matrix: ss.spmatrix): indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) indices[orig_data_size:] = sparse_matrix.indices + @cached_property + def indptr(self) -> np.ndarray: + arr = self.group["indptr"][...] + return arr + def _to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"] mtx.indices = self.group["indices"] - mtx.indptr = self.group["indptr"][:] + mtx.indptr = self.indptr return mtx def to_memory(self) -> ss.spmatrix: @@ -444,7 +531,7 @@ def to_memory(self) -> ss.spmatrix: mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] mtx.indices = self.group["indices"][...] 
- mtx.indptr = self.group["indptr"][...] + mtx.indptr = self.indptr return mtx @@ -472,7 +559,7 @@ class CSCDataset(BaseCompressedSparseDataset): format = "csc" -def sparse_dataset(group: ZarrGroup | H5Group) -> CSRDataset | CSCDataset: +def sparse_dataset(group: GroupStorageType) -> CSRDataset | CSCDataset: """Generates a backed mode-compatible sparse dataset class. Parameters @@ -489,12 +576,12 @@ def sparse_dataset(group: ZarrGroup | H5Group) -> CSRDataset | CSCDataset: >>> import zarr >>> from anndata.experimental import sparse_dataset - >>> group = zarr.open_group('./my_test_store.zarr') - >>> group['data'] = [10, 20, 30, 40, 50, 60, 70, 80] - >>> group['indices'] = [0, 1, 1, 3, 2, 3, 4, 5] - >>> group['indptr'] = [0, 2, 4, 7, 8] - >>> group.attrs['shape'] = (4, 6) - >>> group.attrs['encoding-type'] = 'csr_matrix' + >>> group = zarr.open_group("./my_test_store.zarr") + >>> group["data"] = [10, 20, 30, 40, 50, 60, 70, 80] + >>> group["indices"] = [0, 1, 1, 3, 2, 3, 4, 5] + >>> group["indptr"] = [0, 2, 4, 7, 8] + >>> group.attrs["shape"] = (4, 6) + >>> group.attrs["encoding-type"] = "csr_matrix" >>> sparse_dataset(group) CSRDataset: backend zarr, shape (4, 6), data_dtype int64 """ @@ -508,3 +595,30 @@ def sparse_dataset(group: ZarrGroup | H5Group) -> CSRDataset | CSCDataset: @_subset.register(BaseCompressedSparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] + + +## Backwards compat + +_sparsedataset_depr_msg = """\ +SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset. + +For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead. + +For creation, use `anndata.experimental.sparse_dataset(X)` instead. +""" + + +class SparseDataset(ABC): + """DEPRECATED. + + Use CSRDataset, CSCDataset, and sparse_dataset from anndata.experimental instead. 
+ """ + + def __new__(cls, group): + warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=2) + return sparse_dataset(group) + + @classmethod + def __subclasshook__(cls, C): + warnings.warn(FutureWarning(_sparsedataset_depr_msg), stacklevel=3) + return issubclass(C, (CSRDataset, CSCDataset)) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 36faf5fbe..ce86a27ee 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -395,6 +395,10 @@ def _resolve_idx(old, new, l): @_resolve_idx.register(np.ndarray) def _resolve_idx_ndarray(old, new, l): + if is_bool_dtype(old) and is_bool_dtype(new): + mask_new = np.zeros_like(old) + mask_new[np.flatnonzero(old)[new]] = True + return mask_new if is_bool_dtype(old): old = np.where(old)[0] return old[new] diff --git a/anndata/_io/h5ad.py b/anndata/_io/h5ad.py index bfddc6504..5f31da04a 100644 --- a/anndata/_io/h5ad.py +++ b/anndata/_io/h5ad.py @@ -6,6 +6,7 @@ from types import MappingProxyType from typing import ( TYPE_CHECKING, + Any, Callable, Literal, TypeVar, @@ -29,6 +30,7 @@ ) from ..experimental import read_dispatched from .specs import read_elem, write_elem +from .specs.registry import IOSpec, write_spec from .utils import ( H5PY_V3, _read_legacy_raw, @@ -110,7 +112,14 @@ def write_h5ad( @report_write_key_on_error -def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})): +@write_spec(IOSpec("array", "0.2.0")) +def write_sparse_as_dense( + f: h5py.Group, + key: str, + value: sparse.spmatrix | BaseCompressedSparseDataset, + *, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): real_key = None # Flag for if temporary key was used if key in f: if isinstance(value, BaseCompressedSparseDataset) and ( @@ -267,7 +276,7 @@ def callback(func, elem_name: str, elem, iospec): def _read_raw( f: h5py.File | AnnDataFileManager, as_sparse: Collection[str] = (), - rdasp: Callable[[h5py.Dataset], sparse.spmatrix] = None, + rdasp: Callable[[h5py.Dataset], sparse.spmatrix] | None = None, *, attrs: Collection[str] = ("X", "var", "varm"), ) -> dict: @@ -284,7 +293,7 @@ def _read_raw( @report_read_key_on_error -def read_dataframe_legacy(dataset) -> pd.DataFrame: +def read_dataframe_legacy(dataset: h5py.Dataset) -> pd.DataFrame: """Read pre-anndata 0.7 dataframes.""" warn( f"'{dataset.name}' was written with a very old version of AnnData. 
" @@ -303,7 +312,7 @@ def read_dataframe_legacy(dataset) -> pd.DataFrame: return df -def read_dataframe(group) -> pd.DataFrame: +def read_dataframe(group: h5py.Group | h5py.Dataset) -> pd.DataFrame: """Backwards compat function""" if not isinstance(group, h5py.Group): return read_dataframe_legacy(group) @@ -350,7 +359,7 @@ def read_dense_as_sparse( raise ValueError(f"Cannot read dense array as type: {sparse_format}") -def read_dense_as_csr(dataset, axis_chunk=6000): +def read_dense_as_csr(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk): dense_chunk = dataset[idx] @@ -359,7 +368,7 @@ def read_dense_as_csr(dataset, axis_chunk=6000): return sparse.vstack(sub_matrices, format="csr") -def read_dense_as_csc(dataset, axis_chunk=6000): +def read_dense_as_csc(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk): sub_matrix = sparse.csc_matrix(dataset[idx]) diff --git a/anndata/_io/read.py b/anndata/_io/read.py index 68f7fbd27..a50c4b2ef 100644 --- a/anndata/_io/read.py +++ b/anndata/_io/read.py @@ -274,16 +274,16 @@ def read_loom( uns = {} if cleanup: uns_obs = {} - for key in list(obs.keys()): - if len(set(obs[key])) == 1: - uns_obs[f"{key}"] = obs[key][0] + for key in obs.columns: + if len(obs[key].unique()) == 1: + uns_obs[key] = obs[key].iloc[0] del obs[key] if uns_obs: uns["loom-obs"] = uns_obs uns_var = {} - for key in list(var.keys()): - if len(set(var[key])) == 1: - uns_var[f"{key}"] = var[key][0] + for key in var.columns: + if len(var[key].unique()) == 1: + uns_var[key] = var[key].iloc[0] del var[key] if uns_var: uns["loom-var"] = uns_var diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 00cd66ea7..70bd36945 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -645,7 +645,7 @@ def read_awkward(elem, _reader): length = _read_attr(elem.attrs, "length") container = {k: _reader.read_elem(elem[k]) for k in elem.keys()} - return ak.from_buffers(form, length, container) + return ak.from_buffers(form, int(length), container) ############## @@ -663,10 +663,23 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): if reserved in df.columns: raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.") group = f.require_group(key) + if not df.columns.is_unique: + duplicates = list(df.columns[df.columns.duplicated()]) + raise ValueError( + f"Found repeated column names: {duplicates}. Column names must be unique." + ) col_names = [check_key(c) for c in df.columns] group.attrs["column-order"] = col_names if df.index.name is not None: + if df.index.name in col_names and not pd.Series( + df.index, index=df.index + ).equals(df[df.index.name]): + raise ValueError( + f"DataFrame.index.name ({df.index.name!r}) is also used by a column " + "whose values are different. This is not supported. Please make sure " + "the values are the same, or use a different name." 
+ ) index_name = df.index.name else: index_name = "_index" diff --git a/anndata/_io/specs/registry.py b/anndata/_io/specs/registry.py index 1f0b137f4..a8357295d 100644 --- a/anndata/_io/specs/registry.py +++ b/anndata/_io/specs/registry.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, Mapping +from collections.abc import Mapping from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType @@ -10,12 +10,13 @@ from anndata.compat import _read_attr if TYPE_CHECKING: + from collections.abc import Callable, Generator, Iterable + from anndata._types import GroupStorageType, StorageType + # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" # TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that? - - @dataclass(frozen=True) class IOSpec: encoding_type: str @@ -25,7 +26,9 @@ class IOSpec: # TODO: Should this subclass from LookupError? class IORegistryError(Exception): @classmethod - def _from_write_parts(cls, dest_type, typ, modifiers) -> IORegistryError: + def _from_write_parts( + cls, dest_type: type, typ: type, modifiers: frozenset[str] + ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: msg += f" with {modifiers}" @@ -36,7 +39,7 @@ def _from_read_parts( cls, method: str, registry: Mapping, - src_typ: StorageType, + src_typ: type[StorageType], spec: IOSpec, ) -> IORegistryError: # TODO: Improve error message if type exists, but version does not @@ -50,7 +53,7 @@ def _from_read_parts( def write_spec(spec: IOSpec): def decorator(func: Callable): @wraps(func) - def wrapper(g, k, *args, **kwargs): + def wrapper(g: GroupStorageType, k: str, *args, **kwargs): result = func(g, k, *args, **kwargs) g[k].attrs.setdefault("encoding-type", spec.encoding_type) g[k].attrs.setdefault("encoding-version", spec.encoding_version) @@ -193,12 +196,12 @@ def proc_spec(spec) -> IOSpec: @proc_spec.register(IOSpec) -def proc_spec_spec(spec) -> IOSpec: +def proc_spec_spec(spec: IOSpec) -> IOSpec: return spec @proc_spec.register(Mapping) -def proc_spec_mapping(spec) -> IOSpec: +def proc_spec_mapping(spec: Mapping[str, str]) -> IOSpec: return IOSpec(**{k.replace("-", "_"): v for k, v in spec.items()}) @@ -213,7 +216,9 @@ def get_spec( ) -def _iter_patterns(elem): +def _iter_patterns( + elem, +) -> Generator[tuple[type, type | str] | tuple[type, type, str], None, None]: """Iterates over possible patterns for an element in order of precedence.""" from anndata.compat import DaskArray @@ -236,40 +241,27 @@ def __init__(self, registry: IORegistry, callback: Callable | None = None) -> No def read_elem( self, elem: StorageType, - modifiers: frozenset(str) = frozenset(), + modifiers: frozenset[str] = frozenset(), ) -> Any: """Read an element from a store. 
See exported function for more details.""" from functools import partial - read_func = self.registry.get_reader( - type(elem), get_spec(elem), frozenset(modifiers) + iospec = get_spec(elem) + read_func = partial( + self.registry.get_reader(type(elem), iospec, modifiers), + _reader=self, ) - read_func = partial(read_func, _reader=self) - if self.callback is not None: - return self.callback(read_func, elem.name, elem, iospec=get_spec(elem)) - else: + if self.callback is None: return read_func(elem) + return self.callback(read_func, elem.name, elem, iospec=iospec) class Writer: - def __init__( - self, - registry: IORegistry, - callback: Callable[ - [ - GroupStorageType, - str, - StorageType, - dict, - ], - None, - ] - | None = None, - ): + def __init__(self, registry: IORegistry, callback: Callable | None = None): self.registry = registry self.callback = callback - def find_writer(self, dest_type, elem, modifiers): + def find_writer(self, dest_type: type, elem, modifiers: frozenset[str]): for pattern in _iter_patterns(elem): if self.registry.has_writer(dest_type, pattern, modifiers): return self.registry.get_writer(dest_type, pattern, modifiers) @@ -281,10 +273,10 @@ def write_elem( self, store: GroupStorageType, k: str, - elem, + elem: Any, *, - dataset_kwargs=MappingProxyType({}), - modifiers=frozenset(), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + modifiers: frozenset[str] = frozenset(), ): from functools import partial from pathlib import PurePosixPath @@ -313,17 +305,16 @@ def write_elem( _writer=self, ) - if self.callback is not None: - return self.callback( - write_func, - store, - k, - elem, - dataset_kwargs=dataset_kwargs, - iospec=self.registry.get_spec(elem), - ) - else: + if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) + return self.callback( + write_func, + store, + k, + elem, + dataset_kwargs=dataset_kwargs, + iospec=self.registry.get_spec(elem), + ) def read_elem(elem: StorageType) -> Any: @@ -346,7 +337,7 @@ def write_elem( k: str, elem: Any, *, - dataset_kwargs: Mapping = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write an element to a storage group using anndata encoding. 
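
Reviewer note: the Reader/Writer refactor above routes every element through an optional callback that now receives the resolved IOSpec. A minimal sketch (not part of the diff) of how that hook is reached from the public API, assuming the experimental read_dispatched entry point and a hypothetical "example.h5ad" file:

import h5py
from anndata.experimental import read_dispatched

def callback(read_func, elem_name, elem, iospec):
    # iospec is the IOSpec resolved via get_spec(elem) before dispatch
    print(f"reading {elem_name!r} encoded as {iospec.encoding_type!r} v{iospec.encoding_version}")
    return read_func(elem)  # read_func already has _reader bound via functools.partial

with h5py.File("example.h5ad", "r") as f:
    adata = read_dispatched(f, callback)
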
diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index e6bccde01..6f46ad7eb 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -1,19 +1,23 @@ from __future__ import annotations from functools import wraps -from typing import Callable, Literal +from typing import TYPE_CHECKING, Callable, Literal, Union, cast from warnings import warn import h5py -from packaging import version - -from anndata.compat import H5Group, ZarrGroup, add_note +from packaging.version import Version from .._core.sparse_dataset import BaseCompressedSparseDataset +from ..compat import H5Group, ZarrGroup, add_note, pairwise + +if TYPE_CHECKING: + from .._types import StorageType + + Storage = Union[StorageType, BaseCompressedSparseDataset] # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 -H5PY_V3 = version.parse(h5py.__version__).major >= 3 +H5PY_V3 = Version(h5py.__version__).major >= 3 # ------------------------------------------------------------------------------- # Type conversion @@ -151,38 +155,31 @@ class AnnDataReadError(OSError): pass -def _get_parent(elem): - try: - import zarr - except ImportError: - zarr = None - if zarr and isinstance(elem, (zarr.Group, zarr.Array)): - parent = elem.store # Not sure how to always get a name out of this - elif isinstance(elem, BaseCompressedSparseDataset): - parent = elem.group.file.name - else: - parent = elem.file.name - return parent +def _get_display_path(store: Storage) -> str: + """Return an absolute path of an element (always starts with “/”).""" + if isinstance(store, BaseCompressedSparseDataset): + store = store.group + path = store.name or "??" # can be None + return f'/{path.removeprefix("/")}' -def re_raise_error(e, elem, key, op=Literal["read", "writ"]): +def add_key_note( + e: BaseException, store: Storage, path: str, key: str, op: Literal["read", "writ"] +) -> None: if any( f"Error raised while {op}ing key" in note for note in getattr(e, "__notes__", []) ): - raise - else: - parent = _get_parent(elem) - add_note( - e, - f"Error raised while {op}ing key {key!r} of {type(elem)} to " f"{parent}", - ) - raise e + return + + dir = "to" if op == "writ" else "from" + msg = f"Error raised while {op}ing key {key!r} of {type(store)} {dir} {path}" + add_note(e, msg) def report_read_key_on_error(func): """\ - A decorator for zarr element reading which makes keys involved in errors get reported. + A decorator for hdf5/zarr element reading which makes keys involved in errors get reported. Example ------- @@ -200,20 +197,25 @@ def func_wrapper(*args, **kwargs): from anndata._io.specs import Reader # Figure out signature (method vs function) by going through args - for elem in args: - if not isinstance(elem, Reader): + for arg in args: + if not isinstance(arg, Reader): + store = cast("Storage", arg) break + else: + raise ValueError("No element found in args.") try: return func(*args, **kwargs) except Exception as e: - re_raise_error(e, elem, elem.name, "read") + path, key = _get_display_path(store).rsplit("/", 1) + add_key_note(e, store, path or "/", key, "read") + raise return func_wrapper def report_write_key_on_error(func): """\ - A decorator for zarr element reading which makes keys involved in errors get reported. + A decorator for hdf5/zarr element writing which makes keys involved in errors get reported. 
Example ------- @@ -231,15 +233,18 @@ def func_wrapper(*args, **kwargs): from anndata._io.specs import Writer # Figure out signature (method vs function) by going through args - for i in range(len(args)): - elem = args[i] - key = args[i + 1] - if not isinstance(elem, Writer): + for arg, key in pairwise(args): + if not isinstance(arg, Writer): + store = cast("Storage", arg) break + else: + raise ValueError("No element found in args.") try: return func(*args, **kwargs) except Exception as e: - re_raise_error(e, elem, key, "writ") + path = _get_display_path(store) + add_key_note(e, store, path, key, "writ") + raise return func_wrapper diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index 022ee8a1d..864475848 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -12,16 +12,10 @@ from anndata._warnings import OldFormatWarning from .._core.anndata import AnnData -from ..compat import ( - _clean_uns, - _from_fixed_length_strings, -) +from ..compat import _clean_uns, _from_fixed_length_strings from ..experimental import read_dispatched, write_dispatched from .specs import read_elem -from .utils import ( - _read_legacy_raw, - report_read_key_on_error, -) +from .utils import _read_legacy_raw, report_read_key_on_error if TYPE_CHECKING: from collections.abc import MutableMapping @@ -139,7 +133,7 @@ def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame: @report_read_key_on_error -def read_dataframe(group) -> pd.DataFrame: +def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame: # Fast paths if isinstance(group, zarr.Array): return read_dataframe_legacy(group) diff --git a/anndata/_settings.py b/anndata/_settings.py new file mode 100644 index 000000000..5128d79b6 --- /dev/null +++ b/anndata/_settings.py @@ -0,0 +1,377 @@ +from __future__ import annotations + +import os +import textwrap +import warnings +from collections.abc import Iterable +from contextlib import contextmanager +from enum import Enum +from inspect import Parameter, signature +from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar + +from anndata.compat.exceptiongroups import add_note + +if TYPE_CHECKING: + from collections.abc import Callable, Sequence + +T = TypeVar("T") + + +class DeprecatedOption(NamedTuple): + option: str + message: str | None + removal_version: str | None + + +# TODO: inherit from Generic[T] as well after python 3.9 is no longer supported +class RegisteredOption(NamedTuple): + option: str + default_value: T + doc: str + validate: Callable[[T], bool] | None + type: object + + +def check_and_get_environ_var( + key: str, + default_value: str, + allowed_values: Sequence[str] | None = None, + cast: Callable[[Any], T] | type[Enum] = lambda x: x, +) -> T: + """Get the environment variable and return it is a (potentially) non-string, usable value. + + Parameters + ---------- + key + The environment variable name. + default_value + The default value for `os.environ.get`. + allowed_values + Allowable string values., by default None + cast + Casting from the string to a (potentially different) python object, by default lambdax:x + + Returns + ------- + The casted value. + """ + environ_value_or_default_value = os.environ.get(key, default_value) + if ( + allowed_values is not None + and environ_value_or_default_value not in allowed_values + ): + warnings.warn( + f'Value "{environ_value_or_default_value}" is not in allowed {allowed_values} for environment variable {key}.\ + Default {default_value} will be used.' 
+ ) + environ_value_or_default_value = default_value + return ( + cast(environ_value_or_default_value) + if not isinstance(cast, type(Enum)) + else cast[environ_value_or_default_value] + ) + + +def check_and_get_bool(option, default_value): + return check_and_get_environ_var( + "ANNDATA_" + option.upper(), + str(int(default_value)), + ["0", "1"], + lambda x: bool(int(x)), + ) + + +_docstring = """ +This manager allows users to customize settings for the anndata package. +Settings here will generally be for advanced use-cases and should be used with caution. + +The following options are available: + +{options_description} + +For setting an option please use :func:`~anndata.settings.override` (local) or set the above attributes directly (global) i.e., `anndata.settings.my_setting = foo`. +For assignment by environment variable, use the variable name in all caps with `ANNDATA_` as the prefix before import of :mod:`anndata`. +For boolean environment variable setting, use 1 for `True` and 0 for `False`. +""" + + +class SettingsManager: + _registered_options: dict[str, RegisteredOption] = {} + _deprecated_options: dict[str, DeprecatedOption] = {} + _config: dict[str, object] = {} + __doc_tmpl__: str = _docstring + + def describe( + self, + option: str | Iterable[str] | None = None, + *, + print_description: bool = True, + ) -> str: + """Print and/or return a (string) description of the option(s). + + Parameters + ---------- + option + Option(s) to be described, by default None (i.e., do all option) + print_description + Whether or not to print the description in addition to returning it., by default True + + Returns + ------- + The description. + """ + if option is None: + return self.describe( + self._registered_options.keys(), print_description=print_description + ) + if isinstance(option, Iterable) and not isinstance(option, str): + return "\n".join( + [self.describe(k, print_description=print_description) for k in option] + ) + registered_option = self._registered_options[option] + doc = registered_option.doc.rstrip("\n") + if option in self._deprecated_options: + opt = self._deprecated_options[option] + if opt.message is not None: + doc += " *" + opt.message + doc += f" {option} will be removed in {opt.removal_version}.*" + if print_description: + print(doc) + return doc + + def deprecate( + self, option: str, removal_version: str, message: str | None = None + ) -> None: + """Deprecate options with a message at a version. + + Parameters + ---------- + option + Which option should be deprecated. + removal_version + The version targeted for removal. + message + A custom message. + """ + self._deprecated_options[option] = DeprecatedOption( + option, message, removal_version + ) + + def register( + self, + option: str, + default_value: T, + description: str, + validate: Callable[[T], bool], + option_type: object | None = None, + get_from_env: Callable[[str, T], T] = lambda x, y: y, + ) -> None: + """Register an option so it can be set/described etc. by end-users + + Parameters + ---------- + option + Option to be set. + default_value + Default value with which to set the option. + description + Description to be used in the docstring. + validate + A function which returns True if the option's value is valid and otherwise should raise a `ValueError` or `TypeError`. + option + Optional override for the option type to be displayed. Otherwise `type(default_value)`. 
+        get_from_env
+            An optional function which takes as arguments the name of the option and a default value and returns the value from the environment variable `ANNDATA_CAPS_OPTION` (or default if not present).
+            Default behavior is to return `default_value` without checking the environment.
+        """
+        try:
+            validate(default_value)
+        except (ValueError, TypeError) as e:
+            add_note(e, f"for option {repr(option)}")
+            raise e
+        option_type_str = (
+            type(default_value).__name__ if option_type is None else str(option_type)
+        )
+        option_type = type(default_value) if option_type is None else option_type
+        doc = f"""\
+        {option}: {option_type_str}
+            {description} Default value of {default_value}.
+        """
+        doc = textwrap.dedent(doc)
+        self._registered_options[option] = RegisteredOption(
+            option, default_value, doc, validate, option_type
+        )
+        self._config[option] = get_from_env(option, default_value)
+        self._update_override_function_for_new_option(option)
+
+    def _update_override_function_for_new_option(
+        self,
+        option: str,
+    ):
+        """This function updates the keyword arguments, docstring, and annotations of the `SettingsManager.override` function as the `SettingsManager.register` method is called.
+
+        Parameters
+        ----------
+        option
+            The option being registered for which the override function needs updating.
+        """
+        option_type = self._registered_options[option].type
+        # Update annotations for type checking.
+        self.override.__annotations__[option] = option_type
+        # __signature__ needs to be updated for tab autocompletion in IPython.
+        # See https://github.com/ipython/ipython/issues/11624 for inspiration.
+        self.override.__func__.__signature__ = signature(self.override).replace(
+            parameters=[
+                Parameter(name="self", kind=Parameter.POSITIONAL_ONLY),
+                *[
+                    Parameter(
+                        name=k,
+                        annotation=option_type,
+                        kind=Parameter.KEYWORD_ONLY,
+                    )
+                    for k in self._registered_options
+                ],
+            ]
+        )
+        # Update docstring for `SettingsManager.override` as well.
+        insert_index = self.override.__doc__.find("\n Yields")
+        option_docstring = "\t" + "\t".join(
+            self.describe(option, print_description=False).splitlines(keepends=True)
+        )
+        self.override.__func__.__doc__ = (
+            self.override.__doc__[:insert_index]
+            + "\n"
+            + option_docstring
+            + self.override.__doc__[insert_index:]
+        )
+
+    def __setattr__(self, option: str, val: object) -> None:
+        """
+        Set an option to a value. To see the allowed options and their descriptions,
+        use the `describe` method.
+
+        Parameters
+        ----------
+        option
+            Option to be set.
+        val
+            Value with which to set the option.
+
+        Raises
+        ------
+        AttributeError
+            If the option has not been registered, this function will raise an error.
+        """
+        if hasattr(super(), option):
+            super().__setattr__(option, val)
+        elif option not in self._registered_options:
+            raise AttributeError(
+                f"{option} is not an available option for anndata.\
+                Please open an issue if you believe this is a mistake."
+            )
+        registered_option = self._registered_options[option]
+        registered_option.validate(val)
+        self._config[option] = val
+
+    def __getattr__(self, option: str) -> object:
+        """
+        Get the option's value.
+
+        Parameters
+        ----------
+        option
+            Option to be retrieved.
+
+        Returns
+        -------
+        Value of the option.
+        """
+        if option in self._deprecated_options:
+            deprecated = self._deprecated_options[option]
+            warnings.warn(
+                DeprecationWarning(
+                    f"{repr(option)} will be removed in {deprecated.removal_version}. 
" + + deprecated.message + ) + ) + if option in self._config: + return self._config[option] + raise AttributeError(f"{option} not found.") + + def __dir__(self) -> Iterable[str]: + return sorted((*dir(super()), *self._config.keys())) + + def reset(self, option: Iterable[str] | str) -> None: + """ + Resets option(s) to its (their) default value(s). + + Parameters + ---------- + option + The option(s) to be reset. + """ + if isinstance(option, Iterable) and not isinstance(option, str): + for opt in option: + self.reset(opt) + else: + self._config[option] = self._registered_options[option].default_value + + @contextmanager + def override(self, **overrides): + """ + Provides local override via keyword arguments as a context manager. + + Parameters + ---------- + + Yields + ------ + None + """ + restore = {a: getattr(self, a) for a in overrides} + try: + for attr, value in overrides.items(): + setattr(self, attr, value) + yield None + finally: + for attr, value in restore.items(): + setattr(self, attr, value) + + @property + def __doc__(self): + options_description = self.describe(print_description=False) + return self.__doc_tmpl__.format( + options_description=options_description, + ) + + +settings = SettingsManager() + +################################################################################## +# PLACE REGISTERED SETTINGS HERE SO THEY CAN BE PICKED UP FOR DOCSTRING CREATION # +################################################################################## + + +categories_option = "remove_unused_categories" +categories_default_value = True +categories_description = ( + "Whether or not to remove unused categories with :class:`~pandas.Categorical`." +) + + +def validate_bool(val) -> bool: + if not isinstance(val, bool): + raise TypeError(f"{val} not valid boolean") + return True + + +settings.register( + categories_option, + categories_default_value, + categories_description, + validate_bool, + get_from_env=check_and_get_bool, +) + +################################################################################## +################################################################################## diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index d6ab5c2f3..39323d73a 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import sys from codecs import decode from collections.abc import Mapping from contextlib import AbstractContextManager @@ -14,6 +15,7 @@ import h5py import numpy as np import pandas as pd +from packaging.version import Version from scipy.sparse import issparse, spmatrix from .exceptiongroups import add_note # noqa: F401 @@ -34,9 +36,9 @@ class Empty: ############################# -try: +if sys.version_info >= (3, 11): from contextlib import chdir -except ImportError: # Python < 3.11 +else: @dataclass class chdir(AbstractContextManager): @@ -51,6 +53,18 @@ def __exit__(self, *_exc_info) -> None: os.chdir(self._old_cwd.pop()) +if sys.version_info >= (3, 10): + from itertools import pairwise +else: + + def pairwise(iterable): + from itertools import tee + + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + ############################# # Optional deps ############################# @@ -391,3 +405,11 @@ def _safe_transpose(x): return _transpose_by_block(x) else: return x.T + + +def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical: + if Version(pd.__version__) >= Version("2.1"): + # Argument added in pandas 2.1 + return cat.map(str, na_action="ignore") + 
else: + return cat.map(str) diff --git a/anndata/core.py b/anndata/core.py deleted file mode 100644 index 8e6ef0382..000000000 --- a/anndata/core.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import annotations - -from warnings import warn - -warn("Please only import from anndata, not anndata.core", DeprecationWarning) - -from ._core import * # noqa: F403, E402 diff --git a/anndata/experimental/_dispatch_io.py b/anndata/experimental/_dispatch_io.py index 4df4d417a..2a399d540 100644 --- a/anndata/experimental/_dispatch_io.py +++ b/anndata/experimental/_dispatch_io.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, Any, Callable if TYPE_CHECKING: + from collections.abc import Mapping + from anndata._io.specs import IOSpec from anndata._types import GroupStorageType, StorageType @@ -55,7 +57,7 @@ def write_dispatched( None, ], *, - dataset_kwargs=MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write elem to store, recursively calling callback at each sub-element. diff --git a/anndata/experimental/merge.py b/anndata/experimental/merge.py index 2413b3348..a3ffc1555 100644 --- a/anndata/experimental/merge.py +++ b/anndata/experimental/merge.py @@ -32,7 +32,7 @@ ) from .._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from .._io.specs import read_elem, write_elem -from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup +from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup, _map_cat_to_str from . import read_dispatched SPARSE_MATRIX = {"csc_matrix", "csr_matrix"} @@ -172,7 +172,10 @@ def write_concat_dense( """ import dask.array as da - darrays = (da.from_array(a, chunks="auto") for a in arrays) + darrays = ( + da.from_array(a, chunks="auto" if a.chunks is None else a.chunks) + for a in arrays + ) res = da.concatenate( [ @@ -381,7 +384,7 @@ def _write_alt_annot(groups, output_group, alt_dim, alt_indices, merge): def _write_dim_annot(groups, output_group, dim, concat_indices, label, label_col, join): concat_annot = pd.concat( - unify_dtypes([read_elem(g[dim]) for g in groups]), + unify_dtypes(read_elem(g[dim]) for g in groups), join=join, ignore_index=True, ) @@ -520,7 +523,7 @@ def concat_on_disk( >>> adata = ad.read_h5ad('merged.h5ad', backed=True) >>> adata.X CSRDataset: backend hdf5, shape (490, 15585), data_dtype float32 - >>> adata.obs['dataset'].value_counts() + >>> adata.obs['dataset'].value_counts() # doctest: +SKIP dataset fetal 344 b_cells 146 @@ -593,7 +596,9 @@ def concat_on_disk( [pd.Series(_df_index(g[dim])) for g in groups], ignore_index=True ) if index_unique is not None: - concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique) + concat_indices = concat_indices.str.cat( + _map_cat_to_str(label_col), sep=index_unique + ) # Resulting indices for {dim} and {alt_dim} concat_indices = pd.Index(concat_indices) diff --git a/anndata/experimental/multi_files/_anncollection.py b/anndata/experimental/multi_files/_anncollection.py index 5a6012709..09533522a 100644 --- a/anndata/experimental/multi_files/_anncollection.py +++ b/anndata/experimental/multi_files/_anncollection.py @@ -15,6 +15,7 @@ from ..._core.merge import concat_arrays, inner_concat_aligned_mapping from ..._core.sparse_dataset import BaseCompressedSparseDataset from ..._core.views import _resolve_idx +from ...compat import _map_cat_to_str ATTRS = ["obs", "obsm", "layers"] @@ -208,7 +209,7 @@ def __getitem__(self, key, use_convert=True): else: if vidx is not None: idx = np.ix_(*idx) if not isinstance(idx[1], slice) 
else idx - arrs.append(arr[idx]) + arrs.append(arr.iloc[idx] if isinstance(arr, pd.Series) else arr[idx]) if len(arrs) > 1: _arr = _merge(arrs) @@ -492,9 +493,12 @@ def convert(self): :: { - 'X': lambda a: a.toarray() if issparse(a) else a, # densify .X - 'obsm': lambda a: np.asarray(a, dtype='float32'), # change dtype for all keys of .obsm - 'obs': dict(key1 = lambda c: c.astype(str)) # change type only for one key of .obs + # densify .X + "X": lambda a: a.toarray() if issparse(a) else a, + # change dtype for all keys of .obsm + "obsm": lambda a: np.asarray(a, dtype="float32"), + # change type only for one key of .obs + "obs": dict(key1=lambda c: c.astype(str)), } """ return self._convert @@ -721,7 +725,7 @@ def __init__( ) if index_unique is not None: concat_indices = concat_indices.str.cat( - label_col.map(str), sep=index_unique + _map_cat_to_str(label_col), sep=index_unique ) self.obs_names = pd.Index(concat_indices) @@ -816,9 +820,12 @@ def convert(self): :: { - 'X': lambda a: a.toarray() if issparse(a) else a, # densify .X - 'obsm': lambda a: np.asarray(a, dtype='float32'), # change dtype for all keys of .obsm - 'obs': dict(key1 = lambda c: c.astype(str)) # change type only for one key of .obs + # densify .X + "X": lambda a: a.toarray() if issparse(a) else a, + # change dtype for all keys of .obsm + "obsm": lambda a: np.asarray(a, dtype="float32"), + # change type only for one key of .obs + "obs": dict(key1=lambda c: c.astype(str)), } """ return self._convert diff --git a/anndata/readwrite.py b/anndata/readwrite.py deleted file mode 100644 index f3d07f732..000000000 --- a/anndata/readwrite.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import annotations - -from warnings import warn - -warn("Please only import from anndata, not anndata.readwrite", DeprecationWarning) - -from ._io import * # noqa: F403, E402 diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 641bdc791..4fb33c039 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -15,7 +15,7 @@ from pandas.api.types import is_numeric_dtype from scipy import sparse -from anndata import AnnData, Raw +from anndata import AnnData, ExperimentalFeatureWarning, Raw from anndata._core.aligned_mapping import AlignedMapping from anndata._core.sparse_dataset import BaseCompressedSparseDataset from anndata._core.views import ArrayView @@ -256,17 +256,19 @@ def gen_adata( awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) - adata = AnnData( - X=X, - obs=obs, - var=var, - obsm=obsm, - varm=varm, - layers=layers, - obsp=obsp, - varp=varp, - uns=uns, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ExperimentalFeatureWarning) + adata = AnnData( + X=X, + obs=obs, + var=var, + obsm=obsm, + varm=varm, + layers=layers, + obsp=obsp, + varp=varp, + uns=uns, + ) return adata @@ -281,6 +283,10 @@ def array_bool_subset(index, min_size=2): return b +def list_bool_subset(index, min_size=2): + return array_bool_subset(index, min_size=min_size).tolist() + + def matrix_bool_subset(index, min_size=2): with warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) @@ -318,6 +324,10 @@ def array_int_subset(index, min_size=2): ) +def list_int_subset(index, min_size=2): + return array_int_subset(index, min_size=min_size).tolist() + + def slice_subset(index, min_size=2): while True: points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False) @@ -337,7 +347,9 @@ def single_subset(index): slice_subset, single_subset, 
array_int_subset, + list_int_subset, array_bool_subset, + list_bool_subset, matrix_bool_subset, spmatrix_bool_subset, ] @@ -410,7 +422,11 @@ def assert_equal_ndarray(a, b, exact=False, elem_name=None): and len(a.dtype) > 1 and len(b.dtype) > 0 ): - assert_equal(pd.DataFrame(a), pd.DataFrame(b), exact, elem_name) + # Reshaping to allow >2d arrays + assert a.shape == b.shape, format_msg(elem_name) + assert_equal( + pd.DataFrame(a.reshape(-1)), pd.DataFrame(b.reshape(-1)), exact, elem_name + ) else: assert np.all(a == b), format_msg(elem_name) @@ -741,3 +757,31 @@ def shares_memory_sparse(x, y): marks=pytest.mark.gpu, ), ] + +try: + import zarr + + class AccessTrackingStore(zarr.DirectoryStore): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._access_count = {} + + def __getitem__(self, key): + for tracked in self._access_count: + if tracked in key: + self._access_count[tracked] += 1 + return super().__getitem__(key) + + def get_access_count(self, key): + return self._access_count[key] + + def set_key_trackers(self, keys_to_track): + for k in keys_to_track: + self._access_count[k] = 0 +except ImportError: + + class AccessTrackingStore: + def __init__(self, *_args, **_kwargs) -> None: + raise ImportError( + "zarr must be imported to create an `AccessTrackingStore` instance." + ) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 993fb91de..2e996bfc6 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -1,13 +1,19 @@ """Tests related to awkward arrays""" from __future__ import annotations +import warnings + import numpy as np import numpy.testing as npt import pandas as pd import pytest import anndata -from anndata import AnnData, ImplicitModificationWarning, read_h5ad +from anndata import ( + AnnData, + ImplicitModificationWarning, + read_h5ad, +) from anndata.compat import awkward as ak from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward from anndata.utils import dim_len @@ -196,8 +202,8 @@ def reversed(self): ] ), # categorical array - ak.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), - ak.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), + ak.str.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), + ak.str.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), # tyical record type with AIRR data consisting of different dtypes ak.Array( [ @@ -375,10 +381,14 @@ def test_concat_mixed_types(key, arrays, expected, join): to_concat.append(tmp_adata) if isinstance(expected, type) and issubclass(expected, Exception): - with pytest.raises(expected): + with pytest.raises(expected), warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", + FutureWarning, + ) anndata.concat(to_concat, axis=axis, join=join) else: - print(to_concat) result_adata = anndata.concat(to_concat, axis=axis, join=join) result = getattr(result_adata, key).get("test", None) assert_equal(expected, result, exact=True) diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py index 777f2e430..7ce6860d1 100644 --- a/anndata/tests/test_backed_sparse.py +++ b/anndata/tests/test_backed_sparse.py @@ -1,5 +1,8 @@ from __future__ import annotations +from functools import partial +from typing import TYPE_CHECKING, Callable, Literal + import h5py import numpy as np import pytest @@ -10,7 +13,13 @@ from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset from 
anndata.experimental import read_dispatched -from anndata.tests.helpers import assert_equal, subset_func +from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func + +if TYPE_CHECKING: + from pathlib import Path + + from numpy.typing import ArrayLike + from pytest_mock import MockerFixture subset_func2 = subset_func @@ -20,15 +29,21 @@ def diskfmt(request): return request.param +M = 50 +N = 50 + + @pytest.fixture(scope="function") -def ondisk_equivalent_adata(tmp_path, diskfmt): +def ondisk_equivalent_adata( + tmp_path: Path, diskfmt: Literal["h5ad", "zarr"] +) -> tuple[AnnData, AnnData, AnnData, AnnData]: csr_path = tmp_path / f"csr.{diskfmt}" csc_path = tmp_path / f"csc.{diskfmt}" dense_path = tmp_path / f"dense.{diskfmt}" write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) - csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1)) + csr_mem = ad.AnnData(X=sparse.random(M, N, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) dense_mem = ad.AnnData(X=csr_mem.X.toarray()) @@ -54,7 +69,7 @@ def callback(func, elem_name, elem, iospec): **{k: read_dispatched(v, callback) for k, v in elem.items()} ) if iospec.encoding_type in {"csc_matrix", "csr_matrix"}: - return sparse_dataset(elem)._to_backed() + return sparse_dataset(elem) return func(elem) adata = read_dispatched(f, callback=callback) @@ -68,7 +83,30 @@ def callback(func, elem_name, elem, iospec): return csr_mem, csr_disk, csc_disk, dense_disk -def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2): +@pytest.mark.parametrize( + "empty_mask", [[], np.zeros(M, dtype=bool)], ids=["empty_list", "empty_bool_mask"] +) +def test_empty_backed_indexing( + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + empty_mask, +): + csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + + assert_equal(csr_mem.X[empty_mask], csr_disk.X[empty_mask]) + assert_equal(csr_mem.X[:, empty_mask], csc_disk.X[:, empty_mask]) + + # The following do not work because of https://github.com/scipy/scipy/issues/19919 + # Our implementation returns a (0,0) sized matrix but scipy does (1,0). 
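For reference, a minimal sketch (illustrative only, not part of this test) of what the single-axis empty selections checked above return for a plain in-memory scipy matrix; the two-axis case discussed in the comment is left to the commented-out assertions below:

```python
import numpy as np
from scipy import sparse

X = sparse.random(50, 50, format="csr", density=0.1)
empty = np.zeros(50, dtype=bool)

X[empty].shape     # (0, 50): no rows selected, column count preserved
X[:, empty].shape  # (50, 0): no columns selected, row count preserved
```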
+ + # assert_equal(csr_mem.X[empty_mask, empty_mask], csr_disk.X[empty_mask, empty_mask]) + # assert_equal(csr_mem.X[empty_mask, empty_mask], csc_disk.X[empty_mask, empty_mask]) + + +def test_backed_indexing( + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + subset_func, + subset_func2, +): csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata obs_idx = subset_func(csr_mem.obs_names) @@ -76,10 +114,121 @@ def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2): assert_equal(csr_mem[obs_idx, var_idx].X, csr_disk[obs_idx, var_idx].X) assert_equal(csr_mem[obs_idx, var_idx].X, csc_disk[obs_idx, var_idx].X) + assert_equal(csr_mem.X[...], csc_disk.X[...]) assert_equal(csr_mem[obs_idx, :].X, dense_disk[obs_idx, :].X) + assert_equal(csr_mem[obs_idx].X, csr_disk[obs_idx].X) assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X) +def make_randomized_mask(size: int) -> np.ndarray: + randomized_mask = np.zeros(size, dtype=bool) + inds = np.random.choice(size, 20, replace=False) + inds.sort() + for i in range(0, len(inds) - 1, 2): + randomized_mask[inds[i] : inds[i + 1]] = True + return randomized_mask + + +def make_alternating_mask(size: int, step: int) -> np.ndarray: + mask_alternating = np.ones(size, dtype=bool) + for i in range(0, size, step): # 5 is too low to trigger new behavior + mask_alternating[i] = False + return mask_alternating + + +# non-random indices, with alternating one false and n true +make_alternating_mask_5 = partial(make_alternating_mask, step=5) +make_alternating_mask_15 = partial(make_alternating_mask, step=15) + + +def make_one_group_mask(size: int) -> np.ndarray: + one_group_mask = np.zeros(size, dtype=bool) + one_group_mask[1 : size // 2] = True + return one_group_mask + + +def make_one_elem_mask(size: int) -> np.ndarray: + one_elem_mask = np.zeros(size, dtype=bool) + one_elem_mask[size // 4] = True + return one_elem_mask + + +# test behavior from https://github.com/scverse/anndata/pull/1233 +@pytest.mark.parametrize( + "make_bool_mask,should_trigger_optimization", + [ + (make_randomized_mask, None), + (make_alternating_mask_15, True), + (make_alternating_mask_5, False), + (make_one_group_mask, True), + (make_one_elem_mask, False), + ], + ids=["randomized", "alternating_15", "alternating_5", "one_group", "one_elem"], +) +def test_consecutive_bool( + mocker: MockerFixture, + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + make_bool_mask: Callable[[int], np.ndarray], + should_trigger_optimization: bool | None, +): + """Tests for optimization from https://github.com/scverse/anndata/pull/1233 + + Parameters + ---------- + mocker + Mocker object + ondisk_equivalent_adata + AnnData objects with sparse X for testing + make_bool_mask + Function for creating a boolean mask. + should_trigger_optimization + Whether or not a given mask should trigger the optimized behavior. + """ + _, csr_disk, csc_disk, _ = ondisk_equivalent_adata + mask = make_bool_mask(csr_disk.shape[0]) + + # indexing needs to be on `X` directly to trigger the optimization. 
+ + # `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where` + from anndata._core import sparse_dataset + + spy = mocker.spy(sparse_dataset, "get_compressed_vectors_for_slices") + assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)]) + if should_trigger_optimization is not None: + assert ( + spy.call_count == 1 if should_trigger_optimization else not spy.call_count + ) + assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]]) + if should_trigger_optimization is not None: + assert ( + spy.call_count == 2 if should_trigger_optimization else not spy.call_count + ) + assert_equal(csr_disk[mask, :], csr_disk[np.where(mask)]) + if should_trigger_optimization is not None: + assert ( + spy.call_count == 3 if should_trigger_optimization else not spy.call_count + ) + subset = csc_disk[:, mask] + assert_equal(subset, csc_disk[:, np.where(mask)[0]]) + if should_trigger_optimization is not None: + assert ( + spy.call_count == 4 if should_trigger_optimization else not spy.call_count + ) + if should_trigger_optimization is not None and not csc_disk.isbacked: + size = subset.shape[1] + if should_trigger_optimization: + subset_subset_mask = np.ones(size).astype("bool") + subset_subset_mask[size // 2] = False + else: + subset_subset_mask = make_one_elem_mask(size) + assert_equal( + subset[:, subset_subset_mask], subset[:, np.where(subset_subset_mask)[0]] + ) + assert ( + spy.call_count == 5 if should_trigger_optimization else not spy.call_count + ), f"Actual count: {spy.call_count}" + + @pytest.mark.parametrize( ["sparse_format", "append_method"], [ @@ -87,7 +236,12 @@ def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2): pytest.param(sparse.csc_matrix, sparse.hstack), ], ) -def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt): +def test_dataset_append_memory( + tmp_path: Path, + sparse_format: Callable[[ArrayLike], sparse.spmatrix], + append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix], + diskfmt: Literal["h5ad", "zarr"], +): path = ( tmp_path / f"test.{diskfmt.replace('ad', '')}" ) # diskfmt is either h5ad or zarr @@ -115,7 +269,12 @@ def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt): pytest.param(sparse.csc_matrix, sparse.hstack), ], ) -def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt): +def test_dataset_append_disk( + tmp_path: Path, + sparse_format: Callable[[ArrayLike], sparse.spmatrix], + append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix], + diskfmt: Literal["h5ad", "zarr"], +): path = ( tmp_path / f"test.{diskfmt.replace('ad', '')}" ) # diskfmt is either h5ad or zarr @@ -139,6 +298,34 @@ def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt): assert_equal(fromdisk, frommem) +@pytest.mark.parametrize( + ["sparse_format"], + [ + pytest.param(sparse.csr_matrix), + pytest.param(sparse.csc_matrix), + ], +) +def test_indptr_cache( + tmp_path: Path, + sparse_format: Callable[[ArrayLike], sparse.spmatrix], +): + path = tmp_path / "test.zarr" # diskfmt is either h5ad or zarr + a = sparse_format(sparse.random(10, 10)) + f = zarr.open_group(path, "a") + ad._io.specs.write_elem(f, "X", a) + store = AccessTrackingStore(path) + store.set_key_trackers(["X/indptr"]) + f = zarr.open_group(store, "a") + a_disk = sparse_dataset(f["X"]) + a_disk[:1] + a_disk[3:5] + a_disk[6:7] + a_disk[8:9] + assert ( + store.get_access_count("X/indptr") == 2 + ) # one each for .zarray and actual access + + 
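The `test_consecutive_bool` cases above target the optimization from scverse/anndata#1233: when a boolean mask selects long consecutive runs, the backed sparse reader can fetch a few contiguous slices instead of many single rows, which is what the spied-on `get_compressed_vectors_for_slices` exploits. A minimal sketch of the run-to-slice idea follows; `mask_to_slices` is a hypothetical helper for illustration, not anndata's implementation:

```python
import numpy as np


def mask_to_slices(mask: np.ndarray) -> list[slice]:
    """Collapse a boolean mask into slices covering its consecutive True runs."""
    (idx,) = np.nonzero(mask)
    if idx.size == 0:
        return []
    # a new run starts wherever the gap between selected indices exceeds 1
    breaks = np.nonzero(np.diff(idx) > 1)[0] + 1
    starts = np.concatenate(([idx[0]], idx[breaks]))
    stops = np.concatenate((idx[breaks - 1] + 1, [idx[-1] + 1]))
    return [slice(int(a), int(b)) for a, b in zip(starts, stops)]


mask_to_slices(np.array([True, True, False, True]))  # [slice(0, 2), slice(3, 4)]
```

Masks like `alternating_15` or `one_group` collapse into few slices, so the slice path pays off, while `alternating_5` or `one_elem` do not; that distinction is what `should_trigger_optimization` encodes in the parametrization above.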
@pytest.mark.parametrize( ["sparse_format", "a_shape", "b_shape"], [ @@ -146,7 +333,13 @@ def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt): pytest.param("csc", (100, 100), (200, 100)), ], ) -def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt): +def test_wrong_shape( + tmp_path: Path, + sparse_format: Literal["csr", "csc"], + a_shape: tuple[int, int], + b_shape: tuple[int, int], + diskfmt: Literal["h5ad", "zarr"], +): path = ( tmp_path / f"test.{diskfmt.replace('ad', '')}" ) # diskfmt is either h5ad or zarr @@ -167,7 +360,22 @@ def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt): a_disk.append(b_disk) -def test_wrong_formats(tmp_path, diskfmt): +def test_reset_group(tmp_path: Path): + path = tmp_path / "test.zarr" # diskfmt is either h5ad or zarr + base = sparse.random(100, 100, format="csr") + + if diskfmt == "zarr": + f = zarr.open_group(path, "a") + else: + f = h5py.File(path, "a") + + ad._io.specs.write_elem(f, "base", base) + disk_mtx = sparse_dataset(f["base"]) + with pytest.raises(AttributeError): + disk_mtx.group = f + + +def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = ( tmp_path / f"test.{diskfmt.replace('ad', '')}" ) # diskfmt is either h5ad or zarr @@ -198,7 +406,7 @@ def test_wrong_formats(tmp_path, diskfmt): assert not np.any((pre_checks != post_checks).toarray()) -def test_anndata_sparse_compat(tmp_path, diskfmt): +def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): path = ( tmp_path / f"test.{diskfmt.replace('ad', '')}" ) # diskfmt is either h5ad or zarr @@ -212,3 +420,16 @@ def test_anndata_sparse_compat(tmp_path, diskfmt): ad._io.specs.write_elem(f, "/", base) adata = ad.AnnData(sparse_dataset(f["/"])) assert_equal(adata.X, base) + + +def test_backed_sizeof( + ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + diskfmt: Literal["h5ad", "zarr"], +): + csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + + assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True) + assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True) + assert csr_disk.__sizeof__(with_disk=True) == csc_disk.__sizeof__(with_disk=True) + assert csr_mem.__sizeof__() > csr_disk.__sizeof__() + assert csr_mem.__sizeof__() > csc_disk.__sizeof__() diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 14127271f..9a47e983e 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -12,6 +12,7 @@ from scipy.sparse import csr_matrix, issparse from anndata import AnnData +from anndata._settings import settings from anndata.tests.helpers import assert_equal, gen_adata # some test objects that we use below @@ -399,6 +400,15 @@ def test_slicing_remove_unused_categories(): assert adata[2:4].obs["k"].cat.categories.tolist() == ["b"] +def test_slicing_dont_remove_unused_categories(): + with settings.override(remove_unused_categories=False): + adata = AnnData( + np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"]) + ) + adata._sanitize() + assert adata[2:4].obs["k"].cat.categories.tolist() == ["a", "b"] + + def test_get_subset_annotation(): adata = AnnData( np.array([[1, 2, 3], [4, 5, 6]]), diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index ac9f7b0a9..e24431436 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -2,16 +2,18 @@ import warnings from collections.abc import Hashable +from contextlib import nullcontext from 
copy import deepcopy from functools import partial, singledispatch from itertools import chain, permutations, product -from typing import Any, Callable +from typing import Any, Callable, Literal import numpy as np import pandas as pd import pytest from boltons.iterutils import default_exit, remap, research from numpy import ma +from packaging.version import Version from scipy import sparse from anndata import AnnData, Raw, concat @@ -27,9 +29,14 @@ as_dense_dask_array, assert_equal, gen_adata, + gen_vstr_recarray, ) from anndata.utils import asarray +mark_legacy_concatenate = pytest.mark.filterwarnings( + r"ignore:.*AnnData\.concatenate is deprecated:FutureWarning" +) + @singledispatch def filled_like(a, fill_value=None): @@ -93,7 +100,7 @@ def fill_val(request): @pytest.fixture(params=[0, 1]) -def axis(request): +def axis(request) -> Literal[0, 1]: return request.param @@ -145,6 +152,7 @@ def test_concat_interface_errors(): concat([]) +@mark_legacy_concatenate @pytest.mark.parametrize( ["concat_func", "backwards_compat"], [ @@ -173,6 +181,7 @@ def test_concatenate_roundtrip(join_type, array_type, concat_func, backwards_com assert_equal(result[orig.obs_names].copy(), orig) +@mark_legacy_concatenate def test_concatenate_dense(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) @@ -248,6 +257,7 @@ def test_concatenate_dense(): assert np.allclose(var_ma.compressed(), var_ma_ref.compressed()) +@mark_legacy_concatenate def test_concatenate_layers(array_type, join_type): adatas = [] for _ in range(5): @@ -307,6 +317,7 @@ def gen_index(n): ] +@mark_legacy_concatenate def test_concatenate_obsm_inner(obsm_adatas): adata = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") @@ -336,6 +347,7 @@ def test_concatenate_obsm_inner(obsm_adatas): pd.testing.assert_frame_equal(true_df, cur_df) +@mark_legacy_concatenate def test_concatenate_obsm_outer(obsm_adatas, fill_val): outer = obsm_adatas[0].concatenate( obsm_adatas[1:], join="outer", fill_value=fill_val @@ -406,6 +418,7 @@ def test_concat_annot_join(obsm_adatas, join_type): ) +@mark_legacy_concatenate def test_concatenate_layers_misaligned(array_type, join_type): adatas = [] for _ in range(5): @@ -419,6 +432,7 @@ def test_concatenate_layers_misaligned(array_type, join_type): assert_equal(merged.X, merged.layers["a"]) +@mark_legacy_concatenate def test_concatenate_layers_outer(array_type, fill_val): # Testing that issue #368 is fixed a = AnnData( @@ -434,6 +448,7 @@ def test_concatenate_layers_outer(array_type, fill_val): ) +@mark_legacy_concatenate def test_concatenate_fill_value(fill_val): def get_obs_els(adata): return { @@ -479,6 +494,7 @@ def get_obs_els(adata): ptr += orig.n_obs +@mark_legacy_concatenate def test_concatenate_dense_duplicates(): X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) @@ -530,6 +546,7 @@ def test_concatenate_dense_duplicates(): ] +@mark_legacy_concatenate def test_concatenate_sparse(): # sparse data from scipy.sparse import csr_matrix @@ -575,6 +592,7 @@ def test_concatenate_sparse(): ] +@mark_legacy_concatenate def test_concatenate_mixed(): X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]])) X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])) @@ -610,6 +628,7 @@ def test_concatenate_mixed(): assert isinstance(adata_all.layers["counts"], sparse.csr_matrix) +@mark_legacy_concatenate def test_concatenate_with_raw(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) @@ -814,7 +833,8 @@ def gen_dim_array(m): # Check values of included elements full_inds = 
np.arange(w_pairwise.shape[axis]) - groups = getattr(w_pairwise, dim).groupby("orig").indices + obs_var: pd.DataFrame = getattr(w_pairwise, dim) + groups = obs_var.groupby("orig", observed=True).indices for k, inds in groups.items(): orig_arr = getattr(adatas[k], dim_attr)["arr"] full_arr = getattr(w_pairwise, dim_attr)["arr"] @@ -1000,6 +1020,15 @@ def gen_something(n): return np.random.choice(options)(n) +def gen_3d_numeric_array(n): + return np.random.randn(n, n, n) + + +def gen_3d_recarray(_): + # Ignoring n as it can get quite slow + return gen_vstr_recarray(8, 3).reshape(2, 2, 2) + + def gen_concat_params(unss, compat2result): value_generators = [ lambda x: x, @@ -1008,6 +1037,8 @@ def gen_concat_params(unss, compat2result): gen_list, gen_sparse, gen_something, + gen_3d_numeric_array, + gen_3d_recarray, ] for gen, (mode, result) in product(value_generators, compat2result.items()): yield pytest.param(unss, mode, result, gen) @@ -1089,7 +1120,7 @@ def test_concatenate_uns(unss, merge_strategy, result, value_gen): print(merge_strategy, "\n", unss, "\n", result) result, *unss = permute_nested_values([result] + unss, value_gen) adatas = [uns_ad(uns) for uns in unss] - with pytest.warns(FutureWarning, match=r"concatenate method is deprecated"): + with pytest.warns(FutureWarning, match=r"concatenate is deprecated"): merged = AnnData.concatenate(*adatas, uns_merge=merge_strategy).uns assert_equal(merged, result, elem_name="uns") @@ -1219,6 +1250,32 @@ def test_concat_ordered_categoricals_retained(): assert c.obs["cat_ordered"].cat.ordered +def test_concat_categorical_dtype_promotion(): + """https://github.com/scverse/anndata/issues/1170 + + When concatenating categorical with other dtype, defer to pandas. + """ + a = AnnData( + np.ones((3, 3)), + obs=pd.DataFrame( + {"col": pd.Categorical(["a", "a", "b"])}, + index=[f"cell_{i:02d}" for i in range(3)], + ), + ) + b = AnnData( + np.ones((3, 3)), + obs=pd.DataFrame( + {"col": ["c", "c", "c"]}, + index=[f"cell_{i:02d}" for i in range(3, 6)], + ), + ) + + result = concat([a, b]) + expected = pd.concat([a.obs, b.obs]) + + assert_equal(result.obs, expected) + + def test_bool_promotion(): np_bool = AnnData( np.ones((5, 1)), @@ -1288,14 +1345,24 @@ def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): dim = ("obs", "var")[axis] expected_size = expected_shape(a, b, axis=axis, join=join_type) - result = concat( - {"a": a, "b": b}, - axis=axis, - join=join_type, - merge=merge_strategy, - pairwise=True, - index_unique="-", + + ctx_concat_empty = ( + pytest.warns( + FutureWarning, + match=r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", + ) + if shape[axis] == 0 and Version(pd.__version__) >= Version("2.1") + else nullcontext() ) + with ctx_concat_empty: + result = concat( + {"a": a, "b": b}, + axis=axis, + join=join_type, + merge=merge_strategy, + pairwise=True, + index_unique="-", + ) assert result.shape == expected_size if join_type == "outer": @@ -1344,6 +1411,7 @@ def test_concat_outer_aligned_mapping(elem): check_filled_like(result, elem_name=f"obsm/{elem}") +@mark_legacy_concatenate def test_concatenate_size_0_dim(): # https://github.com/scverse/anndata/issues/526 @@ -1495,3 +1563,22 @@ def test_error_on_mixed_device(): for p in permutations([cp_adata, cp_sparse_adata]): concat(p) + + +def test_concat_on_var_outer_join(array_type): + # https://github.com/scverse/anndata/issues/1286 + a = AnnData( + obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10)]), + 
var=pd.DataFrame(index=[f"gene_{i:02d}" for i in range(10)]), + layers={ + "X": array_type(np.ones((10, 10))), + }, + ) + b = AnnData( + obs=pd.DataFrame(index=[f"cell_{i:02d}" for i in range(10)]), + var=pd.DataFrame(index=[f"gene_{i:02d}" for i in range(10, 20)]), + ) + + # This shouldn't error + # TODO: specify expected result while accounting for null value + _ = concat([a, b], join="outer", axis=1) diff --git a/anndata/tests/test_concatenate_disk.py b/anndata/tests/test_concatenate_disk.py index f9eab9540..659fb98cf 100644 --- a/anndata/tests/test_concatenate_disk.py +++ b/anndata/tests/test_concatenate_disk.py @@ -109,7 +109,7 @@ def test_anndatas_without_reindex( M = 50 sparse_fmt = "csr" adatas = [] - for _ in range(5): + for i in range(5): if axis == 0: M = np.random.randint(1, 100) else: @@ -122,6 +122,10 @@ def test_anndatas_without_reindex( sparse_fmt=sparse_fmt, **GEN_ADATA_OOC_CONCAT_ARGS, ) + if axis == 0: + a.obs_names = f"{i}-" + a.obs_names + else: + a.var_names = f"{i}-" + a.var_names adatas.append(a) assert_eq_concat_on_disk( diff --git a/anndata/tests/test_dask.py b/anndata/tests/test_dask.py index 7bd353f24..56cb0f8c8 100644 --- a/anndata/tests/test_dask.py +++ b/anndata/tests/test_dask.py @@ -107,21 +107,20 @@ def test_dask_distributed_write(adata, tmp_path, diskfmt): pth = tmp_path / f"test_write.{diskfmt}" g = as_group(pth, mode="w") - with dd.LocalCluster(n_workers=1, threads_per_worker=1, processes=False) as cluster: - with dd.Client(cluster): - M, N = adata.X.shape - adata.obsm["a"] = da.random.random((M, 10)) - adata.obsm["b"] = da.random.random((M, 10)) - adata.varm["a"] = da.random.random((N, 10)) - orig = adata - if diskfmt == "h5ad": - with pytest.raises( - ValueError, match="Cannot write dask arrays to hdf5" - ): - write_elem(g, "", orig) - return - write_elem(g, "", orig) - curr = read_elem(g) + with dd.LocalCluster( + n_workers=1, threads_per_worker=1, processes=False + ) as cluster, dd.Client(cluster): + M, N = adata.X.shape + adata.obsm["a"] = da.random.random((M, 10)) + adata.obsm["b"] = da.random.random((M, 10)) + adata.varm["a"] = da.random.random((N, 10)) + orig = adata + if diskfmt == "h5ad": + with pytest.raises(ValueError, match="Cannot write dask arrays to hdf5"): + write_elem(g, "", orig) + return + write_elem(g, "", orig) + curr = read_elem(g) with pytest.raises(Exception): assert_equal(curr.obsm["a"], curr.obsm["b"]) diff --git a/anndata/tests/test_dask_view_mem.py b/anndata/tests/test_dask_view_mem.py index bb758a223..8ce952ad6 100644 --- a/anndata/tests/test_dask_view_mem.py +++ b/anndata/tests/test_dask_view_mem.py @@ -1,9 +1,14 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pytest import anndata as ad +if TYPE_CHECKING: + import pandas as pd + pytest.importorskip("pytest_memray") # ------------------------------------------------------------------------------ @@ -130,7 +135,11 @@ def test_modify_view_X_memory(mapping_name, give_chunks): subset = adata[:N, :N] assert subset.is_view m = subset.X - m[0, 0] = 100 + with pytest.warns( + ad.ImplicitModificationWarning, + match="Trying to modify attribute `.X` of view, initializing view as actual.", + ): + m[0, 0] = 100 # Normally should expect something around 90 kbs @@ -155,5 +164,5 @@ def test_modify_view_mapping_obs_var_memory(attr_name, give_chunks): ) subset = adata[:N, :N] assert subset.is_view - m = getattr(subset, attr_name)["m"] - m[0] = 100 + m: pd.Series = getattr(subset, attr_name)["m"] + m.iloc[0] = 100 diff --git 
a/anndata/tests/test_deprecations.py b/anndata/tests/test_deprecations.py index 01f202aec..39176e315 100644 --- a/anndata/tests/test_deprecations.py +++ b/anndata/tests/test_deprecations.py @@ -10,10 +10,12 @@ import h5py import numpy as np import pytest +import zarr from scipy import sparse import anndata as ad from anndata import AnnData +from anndata.experimental import CSRDataset, write_elem from anndata.tests.helpers import assert_equal @@ -38,26 +40,24 @@ def test_get_obsvar_array_warn(adata): adata._get_var_array("s1") -# TODO: Why doesn’t this mark work? -# @pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_get_obsvar_array(adata): - with pytest.warns(DeprecationWarning): # Just to hide warnings - assert np.allclose(adata._get_obs_array("a"), adata.obs_vector("a")) - assert np.allclose( - adata._get_obs_array("a", layer="x2"), - adata.obs_vector("a", layer="x2"), - ) - assert np.allclose( - adata._get_obs_array("a", use_raw=True), adata.raw.obs_vector("a") - ) - assert np.allclose(adata._get_var_array("s1"), adata.var_vector("s1")) - assert np.allclose( - adata._get_var_array("s1", layer="x2"), - adata.var_vector("s1", layer="x2"), - ) - assert np.allclose( - adata._get_var_array("s1", use_raw=True), adata.raw.var_vector("s1") - ) + assert np.allclose(adata._get_obs_array("a"), adata.obs_vector("a")) + assert np.allclose( + adata._get_obs_array("a", layer="x2"), + adata.obs_vector("a", layer="x2"), + ) + assert np.allclose( + adata._get_obs_array("a", use_raw=True), adata.raw.obs_vector("a") + ) + assert np.allclose(adata._get_var_array("s1"), adata.var_vector("s1")) + assert np.allclose( + adata._get_var_array("s1", layer="x2"), + adata.var_vector("s1", layer="x2"), + ) + assert np.allclose( + adata._get_var_array("s1", use_raw=True), adata.raw.var_vector("s1") + ) def test_obsvar_vector_Xlayer(adata): @@ -144,3 +144,18 @@ def test_deprecated_sparse_dataset_values(): with pytest.warns(FutureWarning, match="Please use .format"): mtx_backed.format_str + + +def test_deprecated_sparse_dataset(): + from anndata._core.sparse_dataset import SparseDataset + + mem_X = sparse.random(50, 50, format="csr") + g = zarr.group() + write_elem(g, "X", mem_X) + with pytest.warns(FutureWarning, match="SparseDataset is deprecated"): + X = SparseDataset(g["X"]) + + assert isinstance(X, CSRDataset) + + with pytest.warns(FutureWarning, match="SparseDataset is deprecated"): + assert isinstance(X, SparseDataset) diff --git a/anndata/tests/test_hdf5_backing.py b/anndata/tests/test_hdf5_backing.py index 61c0c905c..f7791de62 100644 --- a/anndata/tests/test_hdf5_backing.py +++ b/anndata/tests/test_hdf5_backing.py @@ -81,6 +81,7 @@ def as_dense(request): # TODO: Check to make sure obs, obsm, layers, ... 
are written and read correctly as well +@pytest.mark.filterwarnings("error") def test_read_write_X(tmp_path, mtx_format, backed_mode, as_dense): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.h5ad" @@ -89,11 +90,11 @@ def test_read_write_X(tmp_path, mtx_format, backed_mode, as_dense): orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr")))) orig.write(orig_pth) - backed = ad.read(orig_pth, backed=backed_mode) + backed = ad.read_h5ad(orig_pth, backed=backed_mode) backed.write(backed_pth, as_dense=as_dense) backed.file.close() - from_backed = ad.read(backed_pth) + from_backed = ad.read_h5ad(backed_pth) assert np.all(asarray(orig.X) == asarray(from_backed.X)) @@ -192,8 +193,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2): var_idx = subset_func2(mem_adata.var_names) if ( array_type is asarray - and isinstance(obs_idx, (np.ndarray, sparse.spmatrix)) - and isinstance(var_idx, (np.ndarray, sparse.spmatrix)) + and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix)) + and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix)) ): pytest.xfail( "Fancy indexing does not work with multiple arrays on a h5py.Dataset" @@ -303,10 +304,13 @@ def test_backed_modification_sparse(adata, backing_h5ad, sparse_format): assert adata.filename == backing_h5ad assert adata.isbacked - adata.X[0, [0, 2]] = 10 - adata.X[1, [0, 2]] = [11, 12] - with pytest.raises(ValueError): - adata.X[2, 1] = 13 + with pytest.warns( + PendingDeprecationWarning, match=r"__setitem__ will likely be removed" + ): + adata.X[0, [0, 2]] = 10 + adata.X[1, [0, 2]] = [11, 12] + with pytest.raises(ValueError): + adata.X[2, 1] = 13 assert adata.isbacked diff --git a/anndata/tests/test_io_elementwise.py b/anndata/tests/test_io_elementwise.py index 34a42e7ff..7f7dac4dd 100644 --- a/anndata/tests/test_io_elementwise.py +++ b/anndata/tests/test_io_elementwise.py @@ -298,4 +298,34 @@ def test_read_zarr_from_group(tmp_path, consolidated): read_func = zarr.open with read_func(pth) as z: - assert_equal(ad.read_zarr(z["table/table"]), adata) + expected = ad.read_zarr(z["table/table"]) + assert_equal(adata, expected) + + +def test_dataframe_column_uniqueness(store): + repeated_cols = pd.DataFrame(np.ones((3, 2)), columns=["a", "a"]) + + with pytest_8_raises( + ValueError, + match=r"Found repeated column names: \['a'\]\. 
Column names must be unique\.", + ): + write_elem(store, "repeated_cols", repeated_cols) + + index_shares_col_name = pd.DataFrame( + {"col_name": [1, 2, 3]}, index=pd.Index([1, 3, 2], name="col_name") + ) + + with pytest_8_raises( + ValueError, + match=r"DataFrame\.index\.name \('col_name'\) is also used by a column whose values are different\.", + ): + write_elem(store, "index_shares_col_name", index_shares_col_name) + + index_shared_okay = pd.DataFrame( + {"col_name": [1, 2, 3]}, index=pd.Index([1, 2, 3], name="col_name") + ) + + write_elem(store, "index_shared_okay", index_shared_okay) + result = read_elem(store["index_shared_okay"]) + + assert_equal(result, index_shared_okay) diff --git a/anndata/tests/test_io_utils.py b/anndata/tests/test_io_utils.py index c70091474..803a4ad72 100644 --- a/anndata/tests/test_io_utils.py +++ b/anndata/tests/test_io_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations -from contextlib import suppress +from contextlib import AbstractContextManager, suppress +from typing import TYPE_CHECKING import h5py import pandas as pd @@ -9,13 +10,15 @@ import anndata as ad from anndata._io.specs.registry import IORegistryError -from anndata._io.utils import ( - report_read_key_on_error, -) +from anndata._io.utils import report_read_key_on_error from anndata.compat import _clean_uns from anndata.experimental import read_elem, write_elem from anndata.tests.helpers import pytest_8_raises +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + @pytest.fixture(params=["h5ad", "zarr"]) def diskfmt(request): @@ -29,20 +32,32 @@ def diskfmt(request): pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) -def test_key_error(tmp_path, group_fn): +@pytest.mark.parametrize("nested", [True, False], ids=["nested", "root"]) +def test_key_error( + tmp_path, group_fn: Callable[[Path], zarr.Group | h5py.Group], nested: bool +): @report_read_key_on_error def read_attr(_): raise NotImplementedError() group = group_fn(tmp_path) - with group if hasattr(group, "__enter__") else suppress(): + with group if isinstance(group, AbstractContextManager) else suppress(): + if nested: + group = group.create_group("nested") + path = "/nested" + else: + path = "/" group["X"] = [1, 2, 3] group.create_group("group") - with pytest_8_raises(NotImplementedError, match=r"/X"): + with pytest_8_raises( + NotImplementedError, match=rf"reading key 'X'.*from {path}$" + ): read_attr(group["X"]) - with pytest_8_raises(NotImplementedError, match=r"/group"): + with pytest_8_raises( + NotImplementedError, match=rf"reading key 'group'.*from {path}$" + ): read_attr(group["group"]) @@ -53,7 +68,9 @@ def test_write_error_info(diskfmt, tmp_path): # Assuming we don't define a writer for tuples a = ad.AnnData(uns={"a": {"b": {"c": (1, 2, 3)}}}) - with pytest_8_raises(IORegistryError, match=r"Error raised while writing key 'c'"): + with pytest_8_raises( + IORegistryError, match=r"Error raised while writing key 'c'.*to /uns/a/b" + ): write(a) @@ -89,7 +106,7 @@ class Foo: # (?!...) 
is a negative lookahead # (?s) enables the dot to match newlines # https://stackoverflow.com/a/406408/130164 <- copilot suggested lol - pattern = r"(?s)((?!Error raised while writing key '/?a').)*$" + pattern = r"(?s)^((?!Error raised while writing key '/?a').)*$" with pytest_8_raises(IORegistryError, match=pattern): write_elem(group, "/", {"a": {"b": Foo()}}) diff --git a/anndata/tests/test_io_warnings.py b/anndata/tests/test_io_warnings.py index dfc33ccf1..29ab2d963 100644 --- a/anndata/tests/test_io_warnings.py +++ b/anndata/tests/test_io_warnings.py @@ -1,10 +1,13 @@ from __future__ import annotations +import re import warnings from importlib.util import find_spec from pathlib import Path +import h5py import pytest +from packaging.version import Version import anndata as ad from anndata.tests.helpers import gen_adata @@ -14,10 +17,26 @@ def test_old_format_warning_thrown(): import scanpy as sc - with pytest.warns(ad._warnings.OldFormatWarning): - pth = Path(sc.datasets.__file__).parent / "10x_pbmc68k_reduced.h5ad" + pth = Path(sc.datasets.__file__).parent / "10x_pbmc68k_reduced.h5ad" + # TODO: with Pytest 8, all this can be a + # `with pytest.warns(...), pytest.warns(...):` + with warnings.catch_warnings(record=True) as record: + warnings.simplefilter("always", ad.OldFormatWarning) + warnings.simplefilter("always", FutureWarning) ad.read_h5ad(pth) + assert any(issubclass(w.category, ad.OldFormatWarning) for w in record), [ + w.message for w in record if not issubclass(w.category, FutureWarning) + ] + assert any( + issubclass(w.category, FutureWarning) + and re.match( + r"Moving element from \.uns\['neighbors']\['distances'] to \.obsp\['distances']\.", + str(w.message), + ) + for w in record + ), [w.message for w in record if not issubclass(w.category, ad.OldFormatWarning)] + def test_old_format_warning_not_thrown(tmp_path): pth = tmp_path / "current.h5ad" @@ -25,7 +44,14 @@ def test_old_format_warning_not_thrown(tmp_path): adata.write_h5ad(pth) with warnings.catch_warnings(record=True) as record: - warnings.simplefilter("always", ad._warnings.OldFormatWarning) + warnings.simplefilter("always", ad.OldFormatWarning) + if Version(h5py.__version__) < Version("3.2"): + # https://github.com/h5py/h5py/issues/1808 + warnings.filterwarnings( + "ignore", + r"Passing None into shape arguments as an alias for \(\) is deprecated\.", + category=DeprecationWarning, + ) ad.read_h5ad(pth) diff --git a/anndata/tests/test_layers.py b/anndata/tests/test_layers.py index 4b6a7f287..0eadffdeb 100644 --- a/anndata/tests/test_layers.py +++ b/anndata/tests/test_layers.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pytest +from numba.core.errors import NumbaDeprecationWarning from anndata import AnnData, read_h5ad, read_loom from anndata.tests.helpers import gen_typed_df_t2_size @@ -78,7 +79,15 @@ def test_readwrite(backing_h5ad): def test_readwrite_loom(tmp_path): loom_path = tmp_path / "test.loom" adata = AnnData(X=X, layers=dict(L=L.copy())) - adata.write_loom(loom_path) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) + warnings.filterwarnings( + "ignore", + message=r"datetime.datetime.utcnow\(\) is deprecated", + category=DeprecationWarning, + ) + adata.write_loom(loom_path) adata_read = read_loom(loom_path, X_name="") assert adata.layers.keys() == adata_read.layers.keys() @@ -95,3 +104,16 @@ def test_copy(): bdata = adata.copy() adata.layers["L"] += 10 assert np.all(adata.layers["L"] != bdata.layers["L"]) # 201 + + +def 
test_shape_error(): + adata = AnnData(X=X) + with pytest.raises( + ValueError, + match=( + r"Value passed for key 'L' is of incorrect shape\. " + r"Values of layers must match dimensions \('obs', 'var'\) of parent\. " + r"Value had shape \(4, 3\) while it should have had \(3, 3\)\." + ), + ): + adata.layers["L"] = np.zeros((X.shape[0] + 1, X.shape[1])) diff --git a/anndata/tests/test_obsmvarm.py b/anndata/tests/test_obsmvarm.py index e1e802a9d..c995cc12a 100644 --- a/anndata/tests/test_obsmvarm.py +++ b/anndata/tests/test_obsmvarm.py @@ -6,7 +6,7 @@ import pytest from scipy import sparse -import anndata +from anndata import AnnData M, N = (100, 100) @@ -19,10 +19,10 @@ def adata(): index=[f"cell{i:03d}" for i in range(N)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) - return anndata.AnnData(X, obs=obs, var=var) + return AnnData(X, obs=obs, var=var) -def test_assignment_dict(adata): +def test_assignment_dict(adata: AnnData): d_obsm = dict( a=pd.DataFrame( dict(a1=np.ones(M), a2=[f"a{i}" for i in range(M)]), @@ -45,7 +45,7 @@ def test_assignment_dict(adata): assert np.all(adata.varm[k] == v) -def test_setting_ndarray(adata): +def test_setting_ndarray(adata: AnnData): adata.obsm["a"] = np.ones((M, 10)) adata.varm["a"] = np.ones((N, 10)) assert np.all(adata.obsm["a"] == np.ones((M, 10))) @@ -63,7 +63,7 @@ def test_setting_ndarray(adata): assert h == joblib.hash(adata) -def test_setting_dataframe(adata): +def test_setting_dataframe(adata: AnnData): obsm_df = pd.DataFrame(dict(b_1=np.ones(M), b_2=["a"] * M), index=adata.obs_names) varm_df = pd.DataFrame(dict(b_1=np.ones(N), b_2=["a"] * N), index=adata.var_names) @@ -83,7 +83,7 @@ def test_setting_dataframe(adata): adata.varm["c"] = bad_varm_df -def test_setting_sparse(adata): +def test_setting_sparse(adata: AnnData): obsm_sparse = sparse.random(M, 100) adata.obsm["a"] = obsm_sparse assert not np.any((adata.obsm["a"] != obsm_sparse).data) @@ -105,7 +105,7 @@ def test_setting_sparse(adata): assert h == joblib.hash(adata) -def test_setting_daskarray(adata): +def test_setting_daskarray(adata: AnnData): import dask.array as da adata.obsm["a"] = da.ones((M, 10)) @@ -125,3 +125,15 @@ def test_setting_daskarray(adata): with pytest.raises(ValueError): adata.varm["b"] = da.ones((int(N * 2), 10)) assert h == joblib.hash(adata) + + +def test_shape_error(adata: AnnData): + with pytest.raises( + ValueError, + match=( + r"Value passed for key 'b' is of incorrect shape\. " + r"Values of obsm must match dimensions \('obs',\) of parent\. " + r"Value had shape \(101,\) while it should have had \(100,\)\." 
+ ), + ): + adata.obsm["b"] = np.zeros((adata.shape[0] + 1, adata.shape[0])) diff --git a/anndata/tests/test_obspvarp.py b/anndata/tests/test_obspvarp.py index 5b3e063d3..c39782e8d 100644 --- a/anndata/tests/test_obspvarp.py +++ b/anndata/tests/test_obspvarp.py @@ -9,7 +9,7 @@ import pytest from scipy import sparse -import anndata +from anndata import AnnData from anndata.tests.helpers import gen_typed_df_t2_size from anndata.utils import asarray @@ -24,10 +24,10 @@ def adata(): index=[f"cell{i:03d}" for i in range(M)], ) var = pd.DataFrame(index=[f"gene{i:03d}" for i in range(N)]) - return anndata.AnnData(X, obs=obs, var=var) + return AnnData(X, obs=obs, var=var) -def test_assigmnent_dict(adata): +def test_assigmnent_dict(adata: AnnData): d_obsp = dict( a=pd.DataFrame(np.ones((M, M)), columns=adata.obs_names, index=adata.obs_names), b=np.zeros((M, M)), @@ -46,7 +46,7 @@ def test_assigmnent_dict(adata): assert np.all(asarray(adata.varp[k]) == asarray(v)) -def test_setting_ndarray(adata): +def test_setting_ndarray(adata: AnnData): adata.obsp["a"] = np.ones((M, M)) adata.varp["a"] = np.ones((N, N)) assert np.all(adata.obsp["a"] == np.ones((M, M))) @@ -64,7 +64,7 @@ def test_setting_ndarray(adata): assert h == joblib.hash(adata) -def test_setting_sparse(adata): +def test_setting_sparse(adata: AnnData): obsp_sparse = sparse.random(M, M) adata.obsp["a"] = obsp_sparse assert not np.any((adata.obsp["a"] != obsp_sparse).data) @@ -95,7 +95,7 @@ def test_setting_sparse(adata): ], ids=["heterogeneous", "homogeneous"], ) -def test_setting_dataframe(adata, field, dim, homogenous, df, dtype): +def test_setting_dataframe(adata: AnnData, field, dim, homogenous, df, dtype): if homogenous: with pytest.warns(UserWarning, match=rf"{field.title()} 'df'.*dtype object"): getattr(adata, field)["df"] = df(dim) @@ -107,7 +107,7 @@ def test_setting_dataframe(adata, field, dim, homogenous, df, dtype): assert np.issubdtype(getattr(adata, field)["df"].dtype, dtype) -def test_setting_daskarray(adata): +def test_setting_daskarray(adata: AnnData): import dask.array as da adata.obsp["a"] = da.ones((M, M)) @@ -127,3 +127,15 @@ def test_setting_daskarray(adata): with pytest.raises(ValueError): adata.varp["b"] = da.ones((N, int(N * 2))) assert h == joblib.hash(adata) + + +def test_shape_error(adata: AnnData): + with pytest.raises( + ValueError, + match=( + r"Value passed for key 'a' is of incorrect shape\. " + r"Values of obsp must match dimensions \('obs', 'obs'\) of parent\. " + r"Value had shape \(201, 200\) while it should have had \(200, 200\)\." 
+ ), + ): + adata.obsp["a"] = np.zeros((adata.shape[0] + 1, adata.shape[0])) diff --git a/anndata/tests/test_raw.py b/anndata/tests/test_raw.py index 7e4689d60..b51376b9a 100644 --- a/anndata/tests/test_raw.py +++ b/anndata/tests/test_raw.py @@ -81,7 +81,7 @@ def test_raw_of_view(adata_raw: ad.AnnData): def test_raw_rw(adata_raw: ad.AnnData, backing_h5ad): adata_raw.write(backing_h5ad) - adata_read = ad.read(backing_h5ad) + adata_read = ad.read_h5ad(backing_h5ad) assert_equal(adata_read, adata_raw, exact=True) @@ -96,7 +96,7 @@ def test_raw_view_rw(adata_raw: ad.AnnData, backing_h5ad): assert_equal(adata_raw_view, adata_raw) with pytest.warns(ImplicitModificationWarning, match="initializing view as actual"): adata_raw_view.write(backing_h5ad) - adata_read = ad.read(backing_h5ad) + adata_read = ad.read_h5ad(backing_h5ad) assert_equal(adata_read, adata_raw_view, exact=True) diff --git a/anndata/tests/test_readwrite.py b/anndata/tests/test_readwrite.py index 98de43a61..4521936dd 100644 --- a/anndata/tests/test_readwrite.py +++ b/anndata/tests/test_readwrite.py @@ -13,6 +13,7 @@ import pandas as pd import pytest import zarr +from numba.core.errors import NumbaDeprecationWarning from scipy.sparse import csc_matrix, csr_matrix import anndata as ad @@ -88,7 +89,7 @@ def rw(backing_h5ad): M, N = 100, 101 orig = gen_adata((M, N)) orig.write(backing_h5ad) - curr = ad.read(backing_h5ad) + curr = ad.read_h5ad(backing_h5ad) return curr, orig @@ -139,7 +140,7 @@ def test_readwrite_kitchensink(tmp_path, storage, typ, backing_h5ad, dataset_kwa if storage == "h5ad": adata_src.write(backing_h5ad, **dataset_kwargs) - adata_mid = ad.read(backing_h5ad) + adata_mid = ad.read_h5ad(backing_h5ad) adata_mid.write(tmp_path / "mid.h5ad", **dataset_kwargs) adata = ad.read_h5ad(tmp_path / "mid.h5ad") else: @@ -179,7 +180,7 @@ def test_readwrite_maintain_X_dtype(typ, backing_h5ad): adata_src = ad.AnnData(X) adata_src.write(backing_h5ad) - adata = ad.read(backing_h5ad) + adata = ad.read_h5ad(backing_h5ad) assert adata.X.dtype == adata_src.X.dtype @@ -212,7 +213,7 @@ def test_readwrite_h5ad_one_dimension(typ, backing_h5ad): adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict) adata_one = adata_src[:, 0].copy() adata_one.write(backing_h5ad) - adata = ad.read(backing_h5ad) + adata = ad.read_h5ad(backing_h5ad) assert adata.shape == (3, 1) assert_equal(adata, adata_one) @@ -224,7 +225,7 @@ def test_readwrite_backed(typ, backing_h5ad): adata_src.filename = backing_h5ad # change to backed mode adata_src.write() - adata = ad.read(backing_h5ad) + adata = ad.read_h5ad(backing_h5ad) assert isinstance(adata.obs["oanno1"].dtype, pd.CategoricalDtype) assert not isinstance(adata.obs["oanno2"].dtype, pd.CategoricalDtype) assert adata.obs.index.tolist() == ["name1", "name2", "name3"] @@ -276,7 +277,7 @@ def test_read_full_io_error(tmp_path, name, read, write): store["obs"].attrs["encoding-type"] = "invalid" with pytest_8_raises( IORegistryError, - match=r"raised while reading key '/obs'", + match=r"raised while reading key 'obs'.*from /$", ) as exc_info: read(path) assert re.search( @@ -324,7 +325,8 @@ def check_compressed(key, value): msg = "\n\t".join(not_compressed) raise AssertionError(f"These elements were not compressed correctly:\n\t{msg}") - assert_equal(adata, ad.read_h5ad(pth)) + expected = ad.read_h5ad(pth) + assert_equal(adata, expected) def test_zarr_compression(tmp_path): @@ -349,7 +351,8 @@ def check_compressed(key, value): msg = "\n\t".join(not_compressed) raise AssertionError(f"These elements were not 
compressed correctly:\n\t{msg}") - assert_equal(adata, ad.read_zarr(pth)) + expected = ad.read_zarr(pth) + assert_equal(adata, expected) def test_changed_obs_var_names(tmp_path, diskfmt): @@ -388,7 +391,14 @@ def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): adata_src.obsm["X_a"] = np.zeros((adata_src.n_obs, 2)) adata_src.varm["X_b"] = np.zeros((adata_src.n_vars, 3)) - adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) + warnings.filterwarnings( + "ignore", + message=r"datetime.datetime.utcnow\(\) is deprecated", + category=DeprecationWarning, + ) + adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True) adata = ad.read_loom( tmp_path / "test.loom", @@ -422,7 +432,15 @@ def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): def test_readloom_deprecations(tmp_path): loom_pth = tmp_path / "test.loom" adata_src = gen_adata((5, 10), obsm_types=[np.ndarray], varm_types=[np.ndarray]) - adata_src.write_loom(loom_pth, write_obsm_varm=True) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=NumbaDeprecationWarning) + warnings.filterwarnings( + "ignore", + message=r"datetime.datetime.utcnow\(\) is deprecated", + category=DeprecationWarning, + ) + adata_src.write_loom(loom_pth, write_obsm_varm=True) # obsm_names -> obsm_mapping obsm_mapping = {"df": adata_src.obs.columns} @@ -430,7 +448,7 @@ def test_readloom_deprecations(tmp_path): depr_result = ad.read_loom(loom_pth, obsm_names=obsm_mapping) actual_result = ad.read_loom(loom_pth, obsm_mapping=obsm_mapping) assert_equal(actual_result, depr_result) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(ValueError, match="ambiguous"), pytest.warns(FutureWarning): ad.read_loom(loom_pth, obsm_mapping=obsm_mapping, obsm_names=obsm_mapping) # varm_names -> varm_mapping @@ -439,7 +457,7 @@ def test_readloom_deprecations(tmp_path): depr_result = ad.read_loom(loom_pth, varm_names=varm_mapping) actual_result = ad.read_loom(loom_pth, varm_mapping=varm_mapping) assert_equal(actual_result, depr_result) - with pytest.raises(ValueError, match="ambiguous"): + with pytest.raises(ValueError, match="ambiguous"), pytest.warns(FutureWarning): ad.read_loom(loom_pth, varm_mapping=varm_mapping, varm_names=varm_mapping) # positional -> keyword @@ -521,15 +539,9 @@ def hash_dir_contents(dir: Path) -> dict[str, bytes]: marks=pytest.mark.xfail(reason="Loom can’t handle 0×0 matrices"), ), pytest.param(ad.read_zarr, ad._io.write_zarr, "test_empty.zarr"), - pytest.param( - ad.read_zarr, - ad._io.write_zarr, - "test_empty.zip", - marks=pytest.mark.xfail(reason="Zarr zip storage doesn’t seem to work…"), - ), ], ) -def test_readwrite_hdf5_empty(read, write, name, tmp_path): +def test_readwrite_empty(read, write, name, tmp_path): adata = ad.AnnData(uns=dict(empty=np.array([], dtype=float))) write(tmp_path / name, adata) ad_read = read(tmp_path / name) @@ -537,7 +549,13 @@ def test_readwrite_hdf5_empty(read, write, name, tmp_path): def test_read_excel(): - adata = ad.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=r"datetime.datetime.utcnow\(\) is deprecated", + category=DeprecationWarning, + ) + adata = ad.read_excel(HERE / "data/excel.xlsx", "Sheet1", dtype=int) assert adata.X.tolist() == X_list @@ -728,10 +746,13 @@ def test_scanpy_krumsiek11(tmp_path, diskfmt): filepth = tmp_path / 
f"test.{diskfmt}" import scanpy as sc - orig = sc.datasets.krumsiek11() + # TODO: this should be fixed in scanpy instead + with pytest.warns(UserWarning, match=r"Observation names are not unique"): + orig = sc.datasets.krumsiek11() del orig.uns["highlights"] # Can’t write int keys getattr(orig, f"write_{diskfmt}")(filepth) - read = getattr(ad, f"read_{diskfmt}")(filepth) + with pytest.warns(UserWarning, match=r"Observation names are not unique"): + read = getattr(ad, f"read_{diskfmt}")(filepth) assert_equal(orig, read, exact=True) diff --git a/anndata/tests/test_settings.py b/anndata/tests/test_settings.py new file mode 100644 index 000000000..478e6122c --- /dev/null +++ b/anndata/tests/test_settings.py @@ -0,0 +1,227 @@ +from __future__ import annotations + +import os +from enum import Enum + +import pytest + +from anndata._settings import ( + SettingsManager, + check_and_get_bool, + check_and_get_environ_var, + validate_bool, +) + +option = "test_var" +default_val = False +description = "My doc string!" + +option_2 = "test_var_2" +default_val_2 = False +description_2 = "My doc string 2!" + +option_3 = "test_var_3" +default_val_3 = [1, 2] +description_3 = "My doc string 3!" +type_3 = list[int] + + +def validate_int_list(val) -> bool: + if not isinstance(val, list) or not [isinstance(type(e), int) for e in val]: + raise TypeError(f"{repr(val)} is not a valid int list") + return True + + +settings = SettingsManager() +settings.register(option, default_val, description, validate_bool) + +settings.register(option_2, default_val_2, description_2, validate_bool) + +settings.register( + option_3, + default_val_3, + description_3, + validate_int_list, + type_3, +) + + +def test_register_option_default(): + assert getattr(settings, option) == default_val + assert description in settings.describe(option) + + +def test_register_with_env(monkeypatch): + with monkeypatch.context() as mp: + option_env = "test_var_env" + default_val_env = False + description_env = "My doc string env!" + option_env_var = "ANNDATA_" + option_env.upper() + mp.setenv(option_env_var, "1") + + settings.register( + option_env, + default_val_env, + description_env, + validate_bool, + get_from_env=check_and_get_bool, + ) + + assert settings.test_var_env + + +def test_register_with_env_enum(monkeypatch): + with monkeypatch.context() as mp: + option_env = "test_var_env" + default_val_env = False + description_env = "My doc string env!" 
+ option_env_var = "ANNDATA_" + option_env.upper() + mp.setenv(option_env_var, "b") + + class TestEnum(Enum): + a = False + b = True + + def check_and_get_bool_enum(option, default_value): + return check_and_get_environ_var( + "ANNDATA_" + option.upper(), "a", cast=TestEnum + ).value + + settings.register( + option_env, + default_val_env, + description_env, + validate_bool, + get_from_env=check_and_get_bool_enum, + ) + + assert settings.test_var_env + + +def test_register_bad_option(): + with pytest.raises(TypeError, match="'foo' is not a valid int list"): + settings.register( + "test_var_4", + "foo", # should be a list of ints + description_3, + validate_int_list, + type_3, + ) + + +def test_set_option(): + setattr(settings, option, not default_val) + assert getattr(settings, option) == (not default_val) + settings.reset(option) + assert getattr(settings, option) == default_val + + +def test_dir(): + assert {option, option_2, option_3} <= set(dir(settings)) + assert dir(settings) == sorted(dir(settings)) + + +def test_reset_multiple(): + setattr(settings, option, not default_val) + setattr(settings, option_2, not default_val_2) + settings.reset([option, option_2]) + assert getattr(settings, option) == default_val + assert getattr(settings, option_2) == default_val_2 + + +def test_get_unregistered_option(): + with pytest.raises(AttributeError): + setattr(settings, option + "_different", default_val) + + +def test_override(): + with settings.override(**{option: not default_val}): + assert getattr(settings, option) == (not default_val) + assert getattr(settings, option) == default_val + + +def test_override_multiple(): + with settings.override(**{option: not default_val, option_2: not default_val_2}): + assert getattr(settings, option) == (not default_val) + assert getattr(settings, option_2) == (not default_val_2) + assert getattr(settings, option) == default_val + assert getattr(settings, option_2) == default_val_2 + + +def test_deprecation(): + warning = "This is a deprecation warning!" + version = "0.1.0" + settings.deprecate(option, version, warning) + described_option = settings.describe(option, print_description=False) + # first line is message, second two from deprecation + default_deprecation_message = f"{option} will be removed in {version}.*" + assert described_option.endswith(default_deprecation_message) + described_option = ( + described_option.rstrip().removesuffix(default_deprecation_message).rstrip() + ) + assert described_option.endswith(warning) + with pytest.warns( + DeprecationWarning, + match="'test_var' will be removed in 0.1.0. 
This is a deprecation warning!", + ): + assert getattr(settings, option) == default_val + + +def test_deprecation_no_message(): + version = "0.1.0" + settings.deprecate(option, version) + described_option = settings.describe(option, print_description=False) + # first line is message, second from deprecation version + assert described_option.endswith(f"{option} will be removed in {version}.*") + + +def test_option_typing(): + assert settings._registered_options[option_3].type == type_3 + assert str(type_3) in settings.describe(option_3, print_description=False) + + +def test_check_and_get_environ_var(monkeypatch): + with monkeypatch.context() as mp: + option_env_var = "ANNDATA_OPTION" + assert hash("foo") == check_and_get_environ_var( + option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) + ) + mp.setenv(option_env_var, "bar") + assert hash("bar") == check_and_get_environ_var( + option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) + ) + mp.setenv(option_env_var, "Not foo or bar") + with pytest.warns( + match=f'Value "{os.environ[option_env_var]}" is not in allowed' + ): + check_and_get_environ_var( + option_env_var, "foo", ["foo", "bar"], lambda x: hash(x) + ) + assert hash("Not foo or bar") == check_and_get_environ_var( + option_env_var, "foo", cast=lambda x: hash(x) + ) + + +def test_check_and_get_bool(monkeypatch): + with monkeypatch.context() as mp: + option_env_var = "ANNDATA_" + option.upper() + assert not check_and_get_bool(option, default_val) + mp.setenv(option_env_var, "1") + assert check_and_get_bool(option, default_val) + mp.setenv(option_env_var, "Not 0 or 1") + with pytest.warns( + match=f'Value "{os.environ[option_env_var]}" is not in allowed' + ): + check_and_get_bool(option, default_val) + + +def test_check_and_get_bool_enum(monkeypatch): + with monkeypatch.context() as mp: + option_env_var = "ANNDATA_" + option.upper() + mp.setenv(option_env_var, "b") + + class TestEnum(Enum): + a = False + b = True + + assert check_and_get_environ_var(option_env_var, "a", cast=TestEnum).value diff --git a/anndata/tests/test_views.py b/anndata/tests/test_views.py index b195c13b4..7ac4cfefc 100644 --- a/anndata/tests/test_views.py +++ b/anndata/tests/test_views.py @@ -27,6 +27,10 @@ ) from anndata.utils import asarray +IGNORE_SPARSE_EFFICIENCY_WARNING = pytest.mark.filterwarnings( + "ignore:Changing the sparsity structure:scipy.sparse.SparseEfficiencyWarning" +) + # ------------------------------------------------------------------------------ # Some test data # ------------------------------------------------------------------------------ @@ -103,7 +107,8 @@ def test_views(): assert adata_subset.is_view # now transition to actual object - adata_subset.obs["foo"] = range(2) + with pytest.warns(ad.ImplicitModificationWarning, match=r".*\.obs.*"): + adata_subset.obs["foo"] = range(2) assert not adata_subset.is_view assert adata_subset.obs["foo"].tolist() == list(range(2)) @@ -134,7 +139,8 @@ def test_modify_view_component(matrix_type, mapping_name): subset = adata[:5, :][:, :5] assert subset.is_view m = getattr(subset, mapping_name)["m"] - m[0, 0] = 100 + with pytest.warns(ad.ImplicitModificationWarning, match=rf".*\.{mapping_name}.*"): + m[0, 0] = 100 assert not subset.is_view assert getattr(subset, mapping_name)["m"][0, 0] == 100 @@ -267,6 +273,7 @@ def test_set_varm(adata): # TODO: Determine if this is the intended behavior, # or just the behaviour we’ve had for a while +@IGNORE_SPARSE_EFFICIENCY_WARNING def test_not_set_subset_X(matrix_type_base, subset_func): adata = 
ad.AnnData(matrix_type_base(asarray(sparse.random(20, 20)))) init_hash = joblib.hash(adata) @@ -283,7 +290,8 @@ def test_not_set_subset_X(matrix_type_base, subset_func): subset_func(np.arange(subset.X.shape[1])), subset.var_names ) assert subset.is_view - subset.X[:, internal_idx] = 1 + with pytest.warns(ad.ImplicitModificationWarning, match=r".*X.*"): + subset.X[:, internal_idx] = 1 assert not subset.is_view assert not np.any(asarray(adata.X != orig_X_val)) @@ -307,6 +315,7 @@ def tokenize_anndata(adata: ad.AnnData): # TODO: Determine if this is the intended behavior, # or just the behaviour we’ve had for a while +@IGNORE_SPARSE_EFFICIENCY_WARNING def test_not_set_subset_X_dask(matrix_type_no_gpu, subset_func): adata = ad.AnnData(matrix_type_no_gpu(asarray(sparse.random(20, 20)))) init_hash = tokenize(adata) @@ -323,17 +332,19 @@ def test_not_set_subset_X_dask(matrix_type_no_gpu, subset_func): subset_func(np.arange(subset.X.shape[1])), subset.var_names ) assert subset.is_view - subset.X[:, internal_idx] = 1 + with pytest.warns(ad.ImplicitModificationWarning, match=r".*X.*"): + subset.X[:, internal_idx] = 1 assert not subset.is_view assert not np.any(asarray(adata.X != orig_X_val)) assert init_hash == tokenize(adata) +@IGNORE_SPARSE_EFFICIENCY_WARNING def test_set_scalar_subset_X(matrix_type, subset_func): adata = ad.AnnData(matrix_type(np.zeros((10, 10)))) orig_X_val = adata.X.copy() - subset_idx = slice_subset(adata.obs_names) + subset_idx = subset_func(adata.obs_names) adata_subset = adata[subset_idx, :] @@ -367,7 +378,8 @@ def test_set_subset_obsm(adata, subset_func): ) assert subset.is_view - subset.obsm["o"][internal_idx] = 1 + with pytest.warns(ad.ImplicitModificationWarning, match=r".*obsm.*"): + subset.obsm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.obsm["o"] == orig_obsm_val) @@ -389,7 +401,8 @@ def test_set_subset_varm(adata, subset_func): ) assert subset.is_view - subset.varm["o"][internal_idx] = 1 + with pytest.warns(ad.ImplicitModificationWarning, match=r".*varm.*"): + subset.varm["o"][internal_idx] = 1 assert not subset.is_view assert np.all(adata.varm["o"] == orig_varm_val) @@ -481,7 +494,8 @@ def test_layers_view(): assert real_hash == joblib.hash(real_adata) assert view_hash == joblib.hash(view_adata) - view_adata.layers["L2"] = L[1:, 1:] + 2 + with pytest.warns(ad.ImplicitModificationWarning, match=r".*layers.*"): + view_adata.layers["L2"] = L[1:, 1:] + 2 assert not view_adata.is_view assert real_hash == joblib.hash(real_adata) @@ -537,6 +551,30 @@ def test_double_index(subset_func, subset_func2): assert np.all(v1.var == v2.var) +def test_view_different_type_indices(matrix_type): + orig = gen_adata((30, 30), X_type=matrix_type) + boolean_array_mask = np.random.randint(0, 2, 30).astype("bool") + boolean_list_mask = boolean_array_mask.tolist() + integer_array_mask = np.where(boolean_array_mask)[0] + integer_list_mask = integer_array_mask.tolist() + + assert_equal(orig[integer_array_mask, :], orig[boolean_array_mask, :]) + assert_equal(orig[integer_list_mask, :], orig[boolean_list_mask, :]) + assert_equal(orig[integer_list_mask, :], orig[integer_array_mask, :]) + assert_equal(orig[:, integer_array_mask], orig[:, boolean_array_mask]) + assert_equal(orig[:, integer_list_mask], orig[:, boolean_list_mask]) + assert_equal(orig[:, integer_list_mask], orig[:, integer_array_mask]) + # check that X element is same independent of access + assert_equal(orig[:, integer_list_mask].X, orig.X[:, integer_list_mask]) + assert_equal(orig[:, boolean_list_mask].X, 
orig.X[:, boolean_list_mask]) + assert_equal(orig[:, integer_array_mask].X, orig.X[:, integer_array_mask]) + assert_equal(orig[:, integer_list_mask].X, orig.X[:, integer_list_mask]) + assert_equal(orig[integer_list_mask, :].X, orig.X[integer_list_mask, :]) + assert_equal(orig[boolean_list_mask, :].X, orig.X[boolean_list_mask, :]) + assert_equal(orig[integer_array_mask, :].X, orig.X[integer_array_mask, :]) + assert_equal(orig[integer_list_mask, :].X, orig.X[integer_list_mask, :]) + + def test_view_retains_ndarray_subclass(): adata = ad.AnnData(np.zeros((10, 10))) adata.obsm["foo"] = np.zeros((10, 5)).view(NDArraySubclass) @@ -631,7 +669,8 @@ def test_deepcopy_subset(adata, spmat: type): def test_view_mixin_copies_data(adata, array_type: type, attr): N = 100 adata = ad.AnnData( - obs=pd.DataFrame(index=np.arange(N)), var=pd.DataFrame(index=np.arange(N)) + obs=pd.DataFrame(index=np.arange(N).astype(str)), + var=pd.DataFrame(index=np.arange(N).astype(str)), ) X = array_type(sparse.eye(N, N).multiply(np.arange(1, N + 1))) @@ -678,3 +717,32 @@ def test_x_none(): new = view.copy() assert new.shape == (2, 0) assert new.obs_names.tolist() == ["2", "3"] + + +def test_empty_list_subset(): + orig = gen_adata((10, 10)) + subset = orig[:, []] + assert subset.X.shape == (10, 0) + assert subset.obsm["sparse"].shape == (10, 100) + assert subset.varm["sparse"].shape == (0, 100) + + +# @pytest.mark.parametrize("dim", ["obs", "var"]) +# @pytest.mark.parametrize( +# ("idx", "pat"), +# [ +# pytest.param( +# [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed" +# ), +# pytest.param( +# [[1, 2], [2]], r"setting an array element with a sequence", id="nested" +# ), +# ], +# ) +# def test_subset_errors(dim, idx, pat): +# orig = gen_adata((10, 10)) +# with pytest.raises(ValueError, match=pat): +# if dim == "obs": +# orig[idx, :].X +# elif dim == "var": +# orig[:, idx].X diff --git a/anndata/utils.py b/anndata/utils.py index b5fc5c16c..9c700e28b 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re import warnings from functools import singledispatch, wraps from typing import TYPE_CHECKING, Any @@ -19,6 +20,24 @@ logger = get_logger(__name__) +def import_name(name: str) -> Any: + from importlib import import_module + + parts = name.split(".") + obj = import_module(parts[0]) + for i, name in enumerate(parts[1:]): + try: + obj = import_module(f"{obj.__name__}.{name}") + except ModuleNotFoundError: + break + for name in parts[i + 1 :]: + try: + obj = getattr(obj, name) + except AttributeError: + raise RuntimeError(f"{parts[:i]}, {parts[i+1:]}, {obj} {name}") + return obj + + @singledispatch def asarray(x): """Convert x to a numpy array""" @@ -311,7 +330,19 @@ def convert_dictionary_to_structured_array(source: Mapping[str, Sequence[Any]]): return arr -def deprecated(new_name: str): +def warn_once(msg: str, category: type[Warning], stacklevel: int = 1): + warnings.warn(msg, category, stacklevel=stacklevel) + # Prevent from showing up every time an awkward array is used + # You'd think `'once'` works, but it doesn't at the repl and in notebooks + warnings.filterwarnings("ignore", category=category, message=re.escape(msg)) + + +def deprecated( + new_name: str, + category: type[Warning] = DeprecationWarning, + add_msg: str = "", + hide: bool = True, +): """\ This is a decorator which can be used to mark functions as deprecated. 
It will result in a warning being emitted @@ -319,20 +350,20 @@ def deprecated(new_name: str): """ def decorator(func): + name = func.__qualname__ + msg = ( + f"Use {new_name} instead of {name}, " + f"{name} is deprecated and will be removed in the future." + ) + if add_msg: + msg += f" {add_msg}" + @wraps(func) def new_func(*args, **kwargs): - # turn off filter - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - f"Use {new_name} instead of {func.__name__}, " - f"{func.__name__} will be removed in the future.", - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter("default", DeprecationWarning) # reset filter + warnings.warn(msg, category=category, stacklevel=2) return func(*args, **kwargs) - setattr(new_func, "__deprecated", True) + setattr(new_func, "__deprecated", (category, msg, hide)) return new_func return decorator @@ -345,13 +376,14 @@ class DeprecationMixinMeta(type): """ def __dir__(cls): - def is_deprecated(attr): + def is_hidden(attr) -> bool: if isinstance(attr, property): attr = attr.fget - return getattr(attr, "__deprecated", False) + _, _, hide = getattr(attr, "__deprecated", (None, None, False)) + return hide return [ item for item in type.__dir__(cls) - if not is_deprecated(getattr(cls, item, None)) + if not is_hidden(getattr(cls, item, None)) ] diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py index 73b6c3ff8..05daf0e81 100644 --- a/benchmarks/benchmarks/sparse_dataset.py +++ b/benchmarks/benchmarks/sparse_dataset.py @@ -4,16 +4,30 @@ import zarr from scipy import sparse +from anndata import AnnData from anndata.experimental import sparse_dataset, write_elem +def make_alternating_mask(n): + mask_alternating = np.ones(10_000, dtype=bool) + for i in range(0, 10_000, n): + mask_alternating[i] = False + return mask_alternating + + class SparseCSRContiguousSlice: params = ( [ (10_000, 10_000), # (10_000, 500) ], - [slice(0, 1000), slice(0, 9000), slice(None, 9000, -1), slice(None, None, 2)], + [ + slice(0, 1000), + slice(0, 9000), + slice(None, 9000, -1), + slice(None, None, 2), + make_alternating_mask(10), + ], ) param_names = ["shape", "slice"] @@ -25,9 +39,16 @@ def setup(self, shape, slice): g = zarr.group() write_elem(g, "X", X) self.x = sparse_dataset(g["X"]) + self.adata = AnnData(self.x) def time_getitem(self, shape, slice): self.x[self.slice] def peakmem_getitem(self, shape, slice): self.x[self.slice] + + def time_getitem_adata(self, shape, slice): + self.adata[self.slice] + + def peakmem_getitem_adata(self, shape, slice): + self.adata[self.slice] diff --git a/ci/gpu_ci.yml b/ci/gpu_ci.yml new file mode 100644 index 000000000..9776ec3d5 --- /dev/null +++ b/ci/gpu_ci.yml @@ -0,0 +1,12 @@ +name: cupy_env +channels: + - nvidia + - conda-forge +dependencies: + - python=3.12 + - cuda-version=11.8 + - cupy + - numba + - pytest + - pytest-cov + - pytest-xdist diff --git a/ci/scripts/min-deps.py b/ci/scripts/min-deps.py new file mode 100755 index 000000000..b3f393ea5 --- /dev/null +++ b/ci/scripts/min-deps.py @@ -0,0 +1,99 @@ +#!python3 +from __future__ import annotations + +import argparse +import sys +from collections import deque +from pathlib import Path +from typing import TYPE_CHECKING + +if sys.version_info >= (3, 11): + import tomllib +else: + import tomli as tomllib + +from packaging.requirements import Requirement +from packaging.version import Version + +if TYPE_CHECKING: + from collections.abc import Generator, Iterable + + +def min_dep(req: Requirement) -> Requirement: + """ + 
Given a requirement, return the minimum version specifier. + + Example + ------- + + >>> min_dep(Requirement("numpy>=1.0")) + "numpy==1.0" + """ + req_name = req.name + if req.extras: + req_name = f"{req_name}[{','.join(req.extras)}]" + + if not req.specifier: + return Requirement(req_name) + + min_version = Version("0.0.0.a1") + for spec in req.specifier: + if spec.operator in [">", ">=", "~="]: + min_version = max(min_version, Version(spec.version)) + elif spec.operator == "==": + min_version = Version(spec.version) + + return Requirement(f"{req_name}=={min_version}.*") + + +def extract_min_deps( + dependencies: Iterable[Requirement], *, pyproject +) -> Generator[Requirement, None, None]: + dependencies = deque(dependencies) # We'll be mutating this + project_name = pyproject["project"]["name"] + + while len(dependencies) > 0: + req = dependencies.pop() + + # If we are referring to other optional dependency lists, resolve them + if req.name == project_name: + assert req.extras, f"Project included itself as dependency, without specifying extras: {req}" + for extra in req.extras: + extra_deps = pyproject["project"]["optional-dependencies"][extra] + dependencies += map(Requirement, extra_deps) + else: + yield min_dep(req) + + +def main(): + parser = argparse.ArgumentParser( + prog="min-deps", + description="""Parse a pyproject.toml file and output a list of minimum dependencies. + + Output is directly passable to `pip install`.""", + usage="pip install `python min-deps.py pyproject.toml`", + ) + parser.add_argument( + "path", type=Path, help="pyproject.toml to parse minimum dependencies from" + ) + parser.add_argument( + "--extras", type=str, nargs="*", default=(), help="extras to install" + ) + + args = parser.parse_args() + + pyproject = tomllib.loads(args.path.read_text()) + + project_name = pyproject["project"]["name"] + deps = [ + *map(Requirement, pyproject["project"]["dependencies"]), + *(Requirement(f"{project_name}[{extra}]") for extra in args.extras), + ] + + min_deps = extract_min_deps(deps, pyproject=pyproject) + + print(" ".join(map(str, min_deps))) + + +if __name__ == "__main__": + main() diff --git a/conftest.py b/conftest.py index 1825ef24c..578bc71d9 100644 --- a/conftest.py +++ b/conftest.py @@ -4,30 +4,50 @@ # TODO: Fix that, e.g. with the `pytest -p anndata.testing._pytest` pattern. from __future__ import annotations -from typing import TYPE_CHECKING +import re +import warnings +from typing import TYPE_CHECKING, cast import pytest from anndata.compat import chdir +from anndata.utils import import_name if TYPE_CHECKING: + from collections.abc import Generator, Iterable from pathlib import Path -doctest_marker = pytest.mark.usefixtures("doctest_env") +@pytest.fixture(autouse=True) +def _suppress_env_for_doctests(request: pytest.FixtureRequest) -> None: + if isinstance(request.node, pytest.DoctestItem): + request.getfixturevalue("_doctest_env") -@pytest.fixture -def doctest_env(cache: pytest.Cache, tmp_path: Path) -> None: + +@pytest.fixture() +def _doctest_env( + request: pytest.FixtureRequest, cache: pytest.Cache, tmp_path: Path +) -> Generator[None, None, None]: from scanpy import settings + assert isinstance(request.node.parent, pytest.Module) + # request.node.parent is either a DoctestModule or a DoctestTextFile. + # Only DoctestModule has a .obj attribute (the imported module). 
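Aside: the branch that follows reads the `__deprecated` attribute set by the reworked `deprecated` decorator in `anndata/utils.py` (earlier in this diff) and installs a matching warnings filter, so doctests of deprecated functions do not fail on their own deprecation warning. A condensed, hedged sketch of that idea, with an invented helper name:

```python
# Condensed sketch (not the fixture itself) of the idea used just below:
# the (category, message, hide) triple stored on a function by the reworked
# `deprecated` decorator in anndata/utils.py is turned into an "ignore"
# filter. The helper name is ours, for illustration only.
import re
import warnings


def silence_own_deprecation(func) -> None:
    detail = getattr(func, "__deprecated", None)  # set by @deprecated
    if detail is not None:
        category, msg, _hide = detail
        warnings.filterwarnings("ignore", category=category, message=re.escape(msg))
```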
+ if request.node.parent.obj: + func = import_name(request.node.name) + warning_detail: tuple[type[Warning], str, bool] | None + if warning_detail := getattr(func, "__deprecated", None): + cat, msg, _ = warning_detail + warnings.filterwarnings("ignore", category=cat, message=re.escape(msg)) + old_dd, settings.datasetdir = settings.datasetdir, cache.mkdir("scanpy-data") with chdir(tmp_path): yield settings.datasetdir = old_dd -def pytest_itemcollected(item): - """Define behavior of pytest.mark.gpu and doctests.""" +def pytest_itemcollected(item: pytest.Item) -> None: + """Define behavior of pytest.mark.gpu.""" from importlib.util import find_spec is_gpu = len([mark for mark in item.iter_markers(name="gpu")]) > 0 @@ -36,5 +56,49 @@ def pytest_itemcollected(item): pytest.mark.skipif(not find_spec("cupy"), reason="Cupy not installed.") ) - if isinstance(item, pytest.DoctestItem): - item.add_marker(doctest_marker) + +def pytest_addoption(parser: pytest.Parser) -> None: + """Hook to register custom CLI options and config values""" + parser.addoption( + "--strict-warnings", + action="store_true", + default=False, + help="Turn warnings into errors that are not overridden by `filterwarnings` or `filterwarnings_when_strict`.", + ) + + parser.addini( + "filterwarnings_when_strict", + "Filters to apply after `-Werror` when --strict-warnings is active", + type="linelist", + default=[], + ) + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: Iterable[pytest.Item] +): + if not config.getoption("--strict-warnings"): + return + + warning_filters = [ + "error", + *_config_get_strlist(config, "filterwarnings"), + *_config_get_strlist(config, "filterwarnings_when_strict"), + ] + warning_marks = [pytest.mark.filterwarnings(f) for f in warning_filters] + + # Add warning filters defined in the config to all tests items. + # Test items might already have @pytest.mark.filterwarnings applied, + # so we prepend ours to ensure that an item’s explicit filters override these. + # Reversing then individually prepending ensures that the order is preserved. + for item in items: + for mark in reversed(warning_marks): + item.add_marker(mark, append=False) + + +def _config_get_strlist(config: pytest.Config, name: str) -> list[str]: + if strs := config.getini(name): + assert isinstance(strs, list) + assert all(isinstance(item, str) for item in strs) + return cast(list[str], strs) + return [] diff --git a/docs/api.md b/docs/api.md index bf9761be1..fb8f40f93 100644 --- a/docs/api.md +++ b/docs/api.md @@ -142,3 +142,13 @@ Utilities for customizing the IO process: ImplicitModificationWarning ``` + +## Settings + +```{eval-rst} +.. 
autosummary:: + :toctree: generated/ + + settings + settings.override +``` diff --git a/docs/benchmark-read-write.ipynb b/docs/benchmark-read-write.ipynb index 44356459d..886bfa0f6 100644 --- a/docs/benchmark-read-write.ipynb +++ b/docs/benchmark-read-write.ipynb @@ -20,8 +20,11 @@ "metadata": {}, "outputs": [], "source": [ - "import anndata as ad\n", - "import scanpy as sc" + "from __future__ import annotations\n", + "\n", + "import scanpy as sc\n", + "\n", + "import anndata as ad" ] }, { @@ -84,7 +87,7 @@ ], "source": [ "%%time\n", - "adata.write('test.h5ad')" + "adata.write(\"test.h5ad\")" ] }, { @@ -103,7 +106,7 @@ ], "source": [ "%%time\n", - "adata = ad.read('test.h5ad')" + "adata = ad.read_h5ad(\"test.h5ad\")" ] }, { @@ -129,7 +132,7 @@ ], "source": [ "%%time\n", - "adata.write_loom('test.loom')" + "adata.write_loom(\"test.loom\")" ] }, { @@ -156,7 +159,7 @@ ], "source": [ "%%time\n", - "adata = ad.read_loom('test.loom')" + "adata = ad.read_loom(\"test.loom\")" ] } ], diff --git a/docs/concatenation.rst b/docs/concatenation.rst index 17674188d..be644dceb 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -33,7 +33,7 @@ Let's start off with an example: If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. - >>> groups = pbmc.obs.groupby("louvain").indices + >>> groups = pbmc.obs.groupby("louvain", observed=True).indices >>> pbmc_concat = ad.concat([pbmc[inds] for inds in groups.values()], merge="same") >>> assert np.array_equal(pbmc.X, pbmc_concat[pbmc.obs_names].X) >>> pbmc_concat @@ -69,7 +69,7 @@ For example, given two anndata objects with differing variables: [0., 1., 0.], [1., 0., 0.]]) -The join argument is used for any element which has both (1) an axis being concatenated and (2) has an axis not being concatenated. +The join argument is used for any element which has both (1) an axis being concatenated and (2) an axis not being concatenated. When concatenating along the `obs` dimension, this means elements of `.X`, `obs`, `.layers`, and `.obsm` will be affected by the choice of `join`. To demonstrate this, let's say we're trying to combine a droplet based experiment with a spatial one. @@ -153,7 +153,7 @@ We provide a few strategies for merging elements aligned to the alternative axes * `None`: No elements aligned to alternative axes are present in the result object. * `"same"`: Elements that are the same in each of the objects. * `"unique"`: Elements for which there is only one possible value. -* `"first"`: The first element seen at each from each position. +* `"first"`: The first element seen in each from each position. * `"only"`: Elements that show up in only one of the objects. We'll show how this works with elements aligned to the alternative axis, and then how merging works with `.uns`. @@ -187,7 +187,7 @@ Now we will split this object by the categorical `"blobs"` and recombine it to i `adatas` is now a list of datasets with disjoint sets of observations and a common set of variables. Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset. 
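Aside: as a quick illustration of the merge behaviour described above, here is a hedged, self-contained sketch; the toy data are invented, and only `anndata.concat` and `merge="same"` are taken from the documentation being edited.

```python
# Toy illustration of merge="same": annotations on the axis that is not being
# concatenated survive only if they are identical in every object. All data
# below are invented for this sketch.
import numpy as np
import pandas as pd

import anndata as ad

adata = ad.AnnData(
    X=np.arange(12, dtype=float).reshape(4, 3),
    obs=pd.DataFrame({"group": ["a", "a", "b", "b"]},
                     index=[f"cell{i}" for i in range(4)]),
    var=pd.DataFrame({"feature_name": [f"gene{i}" for i in range(3)]},
                     index=[f"gene{i}" for i in range(3)]),
)

pieces = [adata[adata.obs["group"] == g].copy() for g in ("a", "b")]
merged = ad.concat(pieces, merge="same")  # feature_name is identical everywhere, so it is kept
assert list(merged.var["feature_name"]) == list(adata.var["feature_name"])
```

Under `merge="unique"` or `merge="first"` the same call keeps annotations under the looser conditions listed above.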
-Taking a look at how this effects concatenation: +Taking a look at how this affects concatenation: >>> ad.concat(adatas) AnnData object with n_obs × n_vars = 640 × 30 diff --git a/docs/conf.py b/docs/conf.py index d5c872c60..29491e947 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,9 +15,6 @@ # -- General configuration ------------------------------------------------ - -needs_sphinx = "1.7" # autosummary bugfix - # General information project = "anndata" author = f"{project} developers" @@ -52,6 +49,7 @@ "sphinx_autodoc_typehints", # needs to be after napoleon "sphinx_issues", "sphinx_design", + "sphinx_search.extension", "sphinxext.opengraph", "scanpydoc", # needs to be before linkcode "sphinx.ext.linkcode", @@ -126,11 +124,13 @@ def setup(app: Sphinx): # -- Options for HTML output ---------------------------------------------- -html_theme = "sphinx_book_theme" +# The theme is sphinx-book-theme, with patches for readthedocs-sphinx-search +html_theme = "scanpydoc" html_theme_options = dict( use_repository_button=True, repository_url="https://github.com/scverse/anndata", repository_branch="main", + navigation_with_keys=False, # https://github.com/pydata/pydata-sphinx-theme/issues/1492 ) html_logo = "_static/img/anndata_schema.svg" issues_github_path = "scverse/anndata" diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index 9843e9c81..3fdc68788 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -57,10 +57,10 @@ var Group varm Group ``` --> -In general, `AnnData` objects are comprised of a various types of elements. +In general, `AnnData` objects are comprised of various types of elements. Each element is encoded as either an Array (or Dataset in hdf5 terminology) or a collection of elements (e.g. Group) in the store. -We record the type of an element using the `encoding-type` and `encoding-version` keys in it's attributes. -For example, we can this file represents an `AnnData` object from this metadata: +We record the type of an element using the `encoding-type` and `encoding-version` keys in its attributes. +For example, we can see that this file represents an `AnnData` object from its metadata: ```python >>> dict(store.attrs) @@ -319,7 +319,7 @@ pca/variance_ratio ## String arrays Arrays of strings are handled differently than numeric arrays since numpy doesn't really have a good way of representing arrays of unicode strings. -`anndata` assumes strings are text-like data, so uses a variable length encoding. +`anndata` assumes strings are text-like data, so it uses a variable length encoding. `````{tab-set} diff --git a/docs/release-notes/0.10.1.md b/docs/release-notes/0.10.1.md index 1b83f8906..2b775c696 100644 --- a/docs/release-notes/0.10.1.md +++ b/docs/release-notes/0.10.1.md @@ -1,10 +1,6 @@ -### 0.10.1 {small}`the future` +### 0.10.1 {small}`2023-10-08` ```{rubric} Bugfix ``` -```{rubric} Documentation -``` - -```{rubric} Performance -``` +* Fix `ad.concat` erroring when concatenating a categorical and object column {pr}`1171` {user}`ivirshup` diff --git a/docs/release-notes/0.10.2.md b/docs/release-notes/0.10.2.md new file mode 100644 index 000000000..411d6a072 --- /dev/null +++ b/docs/release-notes/0.10.2.md @@ -0,0 +1,17 @@ +### 0.10.2 {small}`2023-10-11` + +```{rubric} Bugfix +``` + +* Added compatibility layer for packages relying on `anndata._core.sparse_dataset.SparseDataset`. 
+ Note that this API is *deprecated* and new code should use {class}`~anndata.experimental.CSRDataset`, {class}`~anndata.experimental.CSCDataset`, and {func}`~anndata.experimental.sparse_dataset` instead. + {pr}`1185` {user}`ivirshup` +* Handle deprecation warning from `pd.Categorical.map` thrown during `anndata.concat` {pr}`1189` {user}`flying-sheep` {user}`ivirshup` +* Fixed extra steps being included in IO tracebacks {pr}`1193` {user}`flying-sheep` +* `as_dense` argument of `write_h5ad` no longer writes an array without encoding metadata {pr}`1193` {user}`flying-sheep` + + +```{rubric} Performance +``` + +* Improved performance of `concat_on_disk` with dense arrays in some cases {pr}`1169` {user}`selmanozleyen` diff --git a/docs/release-notes/0.10.3.md b/docs/release-notes/0.10.3.md new file mode 100644 index 000000000..9b39925e8 --- /dev/null +++ b/docs/release-notes/0.10.3.md @@ -0,0 +1,14 @@ +### 0.10.3 {small}`2023-10-31` + +```{rubric} Bugfix +``` +* Prevent pandas from causing infinite recursion when setting a slice of a categorical column {pr}`1211` {user}`flying-sheep` + +```{rubric} Documentation +``` +* Stop showing “Support for Awkward Arrays is currently experimental” warnings when + reading, concatenating, slicing, or transposing AnnData objects {pr}`1182` {user}`flying-sheep` + +```{rubric} Other updates +``` +* Fail canary CI job when tests raise unexpected warnings. {pr}`1182` {user}`flying-sheep` diff --git a/docs/release-notes/0.10.4.md b/docs/release-notes/0.10.4.md new file mode 100644 index 000000000..5588f534a --- /dev/null +++ b/docs/release-notes/0.10.4.md @@ -0,0 +1,14 @@ +### 0.10.4 {small}`2024-01-04` + +```{rubric} Bugfix +``` +* Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep` +* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko` +* `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold` +* `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold` +* `adata.X[...]` fixed for `X` as a `BaseCompressedSparseDataset` with `zarr` backend {pr}`1265` {user}`ilan-gold` +* Improve read/write error reporting {pr}`1273` {user}`flying-sheep` + +```{rubric} Documentation +``` +* Improve aligned mapping error messages {pr}`1252` {user}`flying-sheep` diff --git a/docs/release-notes/0.10.5.md b/docs/release-notes/0.10.5.md new file mode 100644 index 000000000..8ffb759a6 --- /dev/null +++ b/docs/release-notes/0.10.5.md @@ -0,0 +1,19 @@ +### 0.10.5 {small}`2024-01-25` + +```{rubric} Bugfix +``` + +* Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup` +* Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup` +* Fix IO with awkward array version 2.5.2 {pr}`1328` {user}`ivirshup` +* Fix bug (introduced in 0.10.4) where indexing an AnnData with `list[bool]` would return the wrong result {pr}`1332` {user}`ivirshup` + +```{rubric} Documentation +``` +* Re-add search-as-you-type, this time via `readthedocs-sphinx-search` {pr}`1311` {user}`flying-sheep` + +```{rubric} Performance +``` + +* `BaseCompressedSparseDataset`'s `indptr` is cached {pr}`1266` {user}`ilan-gold` +* Improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1233` {user}`ilan-gold` diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md new file 
mode 100644 index 000000000..e8618118a --- /dev/null +++ b/docs/release-notes/0.10.6.md @@ -0,0 +1,21 @@ +### 0.10.6 {small}`the future` + +```{rubric} Bugfix +``` + +* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold` +* Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup` +* Bring optimization from {pr}`1233` to indexing on the whole `AnnData` object, not just the sparse dataset itself {pr}`1365` {user}`ilan-gold` +* Fix mean slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold` + +```{rubric} Documentation +``` + +```{rubric} Performance +``` + +```{rubric} Development +``` + +* `anndata`'s CI now tests against minimum versions of it's dependencies. As a result, several dependencies had their minimum required version bumped. See diff for details {pr}`1314` {user}`ivirshup` +* `anndata` now tests against Python 3.12 {pr}`1373` {user}`ivirshup` diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md index 32aabe87a..220e176c4 100644 --- a/docs/release-notes/0.11.0.md +++ b/docs/release-notes/0.11.0.md @@ -2,6 +2,8 @@ ```{rubric} Features ``` +* Add `settings` object with methods for altering internally-used options, like checking for uniqueness on `obs`' index {pr}`1270` {user}`ilan-gold` +* Add `remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold` ```{rubric} Bugfix ``` @@ -11,3 +13,8 @@ ```{rubric} Performance ``` + +```{rubric} Breaking +``` + +* Removed deprecated modules `anndata.core` and `anndata.readwrite` {pr}`1197` {user}`ivirshup` diff --git a/docs/release-notes/0.6.0.md b/docs/release-notes/0.6.0.md index b2cc1b506..ab4316f64 100644 --- a/docs/release-notes/0.6.0.md +++ b/docs/release-notes/0.6.0.md @@ -26,7 +26,7 @@ ### 0.6.0 {small}`1 May, 2018` - compatibility with Seurat converter -- tremendous speedup for {func}`~anndata.AnnData.concatenate` +- tremendous speedup for {meth}`~anndata.AnnData.concatenate` - bug fix for deep copy of unstructured annotation after slicing - bug fix for reading HDF5 stored single-category annotations - `'outer join'` concatenation: adds zeros for concatenation of sparse data and nans for dense data diff --git a/docs/release-notes/release-latest.md b/docs/release-notes/release-latest.md index 5337aa78f..c36f7a8a5 100644 --- a/docs/release-notes/release-latest.md +++ b/docs/release-notes/release-latest.md @@ -5,6 +5,21 @@ ## Version 0.10 +```{include} /release-notes/0.10.6.md +``` + +```{include} /release-notes/0.10.5.md +``` + +```{include} /release-notes/0.10.4.md +``` + +```{include} /release-notes/0.10.3.md +``` + +```{include} /release-notes/0.10.2.md +``` + ```{include} /release-notes/0.10.1.md ``` diff --git a/pyproject.toml b/pyproject.toml index 7bfbe496a..85bd6ac91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,19 +32,20 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Visualization", ] dependencies = [ - # pandas <1.1.1 has 
pandas/issues/35446 + # pandas <1.4 has pandas/issues/35446 # pandas 2.1.0rc0 has pandas/issues/54622 - "pandas >=1.1.1, !=2.1.0rc0", - "numpy>=1.16.5", # required by pandas 1.x - "scipy>1.4", - "h5py>=3", + "pandas >=1.4, !=2.1.0rc0, !=2.1.2", + "numpy>=1.23", + "scipy>1.8", + "h5py>=3.1", "exceptiongroup; python_version<'3.11'", "natsort", - "packaging>=20", + "packaging>=20.0", "array_api_compat", ] dynamic = ["version"] @@ -64,22 +65,23 @@ dev = [ ] doc = [ "sphinx>=4.4", - "sphinx-book-theme>=1.0.1", + "sphinx-book-theme>=1.1.0", "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", "sphinxext.opengraph", "nbsphinx", - "scanpydoc>=0.9", + "scanpydoc[theme,typehints] >=0.13.4", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks "myst_parser", "sphinx_design>=0.5.0", + "readthedocs-sphinx-search", ] test = [ "loompy>=3.0.5", - "pytest>=6.0", + "pytest>=7.3", "pytest-cov>=2.10", "zarr", "matplotlib", @@ -89,9 +91,11 @@ test = [ "boltons", "scanpy", "httpx", # For data downloading - "dask[array,distributed]", + "dask[array,distributed]>=2022.09.2", "awkward>=2.3", + "pyarrow", "pytest_memray", + "pytest-mock" ] gpu = ["cupy"] @@ -104,26 +108,42 @@ version-file = "anndata/_version.py" [tool.coverage.run] source = ["anndata"] -omit = ["setup.py", "versioneer.py", "anndata/_version.py", "**/test_*.py"] +omit = ["anndata/_version.py", "**/test_*.py"] + +[tool.coverage.report] +exclude_also = [ + "if TYPE_CHECKING:", +] [tool.pytest.ini_options] -addopts = "--doctest-modules" +addopts = [ + "--strict-markers", + "--doctest-modules", +] +filterwarnings = [ + 'ignore:Support for Awkward Arrays is currently experimental', + 'ignore:Outer joins on awkward\.Arrays', + # TODO: replace both lines above with this one once we figured out how prevent ImportPathMismatchError + # 'ignore::anndata._warnings.ExperimentalFeatureWarning', +] +# When `--strict-warnings` is used, all warnings are treated as errors, except those: +filterwarnings_when_strict = [ + "default::anndata._warnings.ImplicitModificationWarning", + "default:Transforming to str index:UserWarning", + "default:(Observation|Variable) names are not unique. 
To make them unique:UserWarning", + "default::scipy.sparse.SparseEfficiencyWarning", + "default::dask.array.core.PerformanceWarning", +] python_files = "test_*.py" testpaths = ["anndata", "docs/concatenation.rst"] -filterwarnings = ['ignore:X\.dtype being converted to np.float32:FutureWarning'] # For some reason this effects how logging is shown when tests are run xfail_strict = true markers = ["gpu: mark test to run on GPU"] -[tool.ruff] -ignore = [ - # line too long -> we accept long comment lines; black gets rid of long code lines - "E501", - # Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments, - "E731", - # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation - "E741", -] +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint] select = [ "E", # Error detected by Pycodestyle "F", # Errors detected by Pyflakes @@ -134,10 +154,18 @@ select = [ "ICN", # Follow import conventions "PTH", # Pathlib instead of os.path ] -[tool.ruff.per-file-ignores] +ignore = [ + # line too long -> we accept long comment lines; formatter gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> AnnData allows lambda expression assignments, + "E731", + # allow I, O, l as variable names -> I is the identity matrix, i, j, k, l is reasonable indexing notation + "E741", +] +[tool.ruff.lint.per-file-ignores] # E721 comparing types, but we specifically are checking that we aren't getting subtypes (views) "anndata/tests/test_readwrite.py" = ["E721"] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"]
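
Aside: combining the new `--strict-warnings` flag (conftest.py) with the `filterwarnings_when_strict` list above turns most warnings into errors while individual tests keep the last word, because the config-derived filters are prepended and an explicit mark on the test overrides them. A hedged sketch of what that looks like from a test author's point of view; the warning text is invented:

```python
# Sketch of how a single test keeps the last word when the suite runs with
# `pytest --strict-warnings`: the filters prepended in conftest.py are
# deliberately overridden by the test's own mark. The warning text here is
# invented for illustration.
from __future__ import annotations

import warnings

import pytest


@pytest.mark.filterwarnings("ignore:old frobnicator API:DeprecationWarning")
def test_uses_deprecated_api():
    warnings.warn("old frobnicator API is going away", DeprecationWarning)
```

Running `pytest --strict-warnings` would then fail on any warning not covered by these filters or by an explicit mark on the test.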