From aa0459dd09a3d62adfa2427acb3653b0035d1601 Mon Sep 17 00:00:00 2001 From: Martin Kim <46072231+martinkim0@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:08:47 -0800 Subject: [PATCH] [misc] replace black with ruff format and lint, run autofixes (#1011) * Add ruff-format to pre-commit * [misc] replace black with ruff format and lint, run autofixes * autofixes on tools and notebooks * Update tools/cellxgene_census_builder/src/cellxgene_census_builder/util.py Co-authored-by: Emanuele Bezzi * Update tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py Co-authored-by: Emanuele Bezzi * Address comment * Update api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py Co-authored-by: Emanuele Bezzi * Update api/python/cellxgene_census/src/cellxgene_census/_util.py Co-authored-by: Emanuele Bezzi * format arguments in _release_directory --------- Co-authored-by: Emanuele Bezzi --- .pre-commit-config.yaml | 48 ++-- api/python/cellxgene_census/pyproject.toml | 79 ++++++- .../src/cellxgene_census/__init__.py | 10 +- .../src/cellxgene_census/_experiment.py | 4 +- .../src/cellxgene_census/_get_anndata.py | 12 +- .../src/cellxgene_census/_open.py | 16 +- .../src/cellxgene_census/_presence_matrix.py | 3 +- .../cellxgene_census/_release_directory.py | 196 ++++++++++------ .../src/cellxgene_census/_util.py | 13 +- .../cellxgene_census/experimental/__init__.py | 4 +- .../experimental/_embedding.py | 22 +- .../experimental/ml/__init__.py | 4 +- .../experimental/ml/huggingface/__init__.py | 4 +- .../ml/huggingface/cell_dataset_builder.py | 16 +- .../ml/huggingface/geneformer_tokenizer.py | 19 +- .../experimental/ml/pytorch.py | 82 +++---- .../experimental/pp/__init__.py | 4 +- .../experimental/pp/_highly_variable_genes.py | 46 ++-- .../experimental/pp/_online.py | 45 ++-- .../experimental/pp/_stats.py | 13 +- .../tests/experimental/ml/test_pytorch.py | 49 +++- .../tests/experimental/pp/test_hvg.py | 20 +- .../tests/experimental/pp/test_online.py | 20 +- .../tests/experimental/pp/test_stats.py | 32 ++- .../cellxgene_census/tests/test_directory.py | 16 +- .../tests/test_get_anndata.py | 13 +- .../cellxgene_census/tests/test_lts_compat.py | 18 +- .../cellxgene_census/tests/test_open.py | 72 ++++-- .../comp_bio_data_integration_scvi.ipynb | 18 +- .../comp_bio_embedding_exploration.ipynb | 99 ++++++-- .../comp_bio_explore_and_load_lung_data.ipynb | 5 +- .../comp_bio_geneformer_prediction.ipynb | 31 +-- ...bio_normalizing_full_gene_sequencing.ipynb | 5 +- .../comp_bio_scvi_model_use.ipynb | 18 +- .../census_access_maintained_embeddings.ipynb | 11 +- .../api_demo/census_compute_over_X.ipynb | 28 +-- .../api_demo/census_dataset_presence.ipynb | 2 - .../notebooks/api_demo/census_embedding.ipynb | 17 +- .../experimental/highly_variable_genes.ipynb | 7 +- .../experimental/mean_variance.ipynb | 4 +- api/python/notebooks/experimental/pca.ipynb | 1 - .../notebooks/experimental/pytorch.ipynb | 219 +++++++++++++++--- api/python/notebooks/pyproject.toml | 86 ++++++- tools/cell_dup_check/finddups.ipynb | 32 +-- .../src/cellxgene_census_builder/__main__.py | 26 +-- .../build_soma/anndata.py | 47 ++-- .../build_soma/build_soma.py | 67 +++--- .../build_soma/census_summary.py | 2 +- .../build_soma/consolidate.py | 17 +- .../build_soma/datasets.py | 22 +- .../build_soma/experiment_builder.py | 133 +++++------ .../build_soma/experiment_specs.py | 18 +- .../build_soma/globals.py | 12 +- .../build_soma/manifest.py | 24 +- .../cellxgene_census_builder/build_soma/mp.py | 45 ++-- 
.../build_soma/schema_util.py | 31 ++- .../build_soma/source_assets.py | 6 +- .../build_soma/stats.py | 15 +- .../build_soma/summary_cell_counts.py | 10 +- .../build_soma/util.py | 24 +- .../build_soma/validate_soma.py | 87 +++---- .../cellxgene_census_builder/build_state.py | 26 +-- .../census_summary.py | 18 +- .../src/cellxgene_census_builder/data_copy.py | 3 +- .../host_validation.py | 22 +- .../src/cellxgene_census_builder/logging.py | 15 +- .../release_cleanup.py | 11 +- .../release_manifest.py | 60 ++--- .../src/cellxgene_census_builder/util.py | 39 ++-- .../tests/anndata/conftest.py | 8 +- .../tests/anndata/test_anndata.py | 24 +- .../tests/conftest.py | 10 +- .../tests/test_builder.py | 19 +- .../tests/test_manifest.py | 25 +- .../tests/test_release_cleanup.py | 26 +-- .../tests/test_release_manifest.py | 5 +- .../tests/test_schema_util.py | 3 +- .../tests/test_source_assets.py | 4 +- .../tests/test_util.py | 3 +- .../tests/test_workflow_steps.py | 3 +- .../src/census_contrib/__main__.py | 21 +- .../census_contrib/src/census_contrib/args.py | 11 +- .../src/census_contrib/census_util.py | 12 +- .../src/census_contrib/config.py | 4 +- .../census_contrib/src/census_contrib/load.py | 19 +- .../src/census_contrib/metadata.py | 29 ++- .../census_contrib/src/census_contrib/save.py | 20 +- .../census_contrib/src/census_contrib/util.py | 48 ++-- .../src/census_contrib/validate.py | 29 +-- .../embeddings_qc_2023-12-15.ipynb | 10 +- .../models/geneformer/finetune-geneformer.py | 4 +- .../generate-geneformer-embeddings.py | 2 +- .../geneformer/helpers/ontology_mapper.py | 61 ++--- .../prepare-census-geneformer-dataset.py | 10 +- tools/models/scvi/scvi-train.py | 2 +- tools/pyproject.toml | 84 ++++++- 96 files changed, 1562 insertions(+), 1177 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 10612ecc4..fb5755e32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,33 +1,39 @@ exclude: (^doc/)|(.*/venv/) default_stages: [commit] repos: - - repo: https://github.com/psf/black - rev: 23.12.1 - hooks: - - id: black-jupyter - name: black-notebooks - files: ^api/python/notebooks - args: ["--config", "./api/python/notebooks/pyproject.toml"] - - id: black - name: black-cellxgene-census - files: ^api/python/cellxgene_census - args: ["--config", "./api/python/cellxgene_census/pyproject.toml"] - - id: black - name: black-tools - files: ^tools - args: ["--config", "./tools/pyproject.toml"] - - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.1.15 + rev: v0.2.2 hooks: - id: ruff name: ruff-cellxgene-census files: ^api/python/cellxgene_census + types_or: [python, pyi, jupyter] args: ["--config=./api/python/cellxgene_census/pyproject.toml", "--fix"] - id: ruff name: ruff-tools files: ^tools - args: [ "--config=./tools/pyproject.toml", "--fix" ] + types_or: [python, pyi, jupyter] + args: ["--config=./tools/pyproject.toml", "--fix" ] + - id: ruff + name: ruff-notebooks + files: ^api/python/notebooks + types_or: [python, pyi, jupyter] + args: ["--config=./api/python/notebooks/pyproject.toml", "--fix"] + - id: ruff-format + name: ruff-format-cellxgene-census + files: ^api/python/cellxgene_census + types_or: [python, pyi, jupyter] + args: ["--config=./api/python/cellxgene_census/pyproject.toml"] + - id: ruff-format + name: ruff-format-tools + files: ^tools + types_or: [python, pyi, jupyter] + args: ["--config=./tools/pyproject.toml"] + - id: ruff-format + name: ruff-format-notebooks + files: ^api/python/notebooks + types_or: [python, pyi, 
jupyter] + args: ["--config=./api/python/notebooks/pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.8.0 @@ -81,12 +87,6 @@ repos: - typing_extensions - types-PyYAML - - repo: https://github.com/nbQA-dev/nbQA - rev: 1.7.1 - hooks: - - id: nbqa-black - files: ^api/python/notebooks - - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.39.0 hooks: diff --git a/api/python/cellxgene_census/pyproject.toml b/api/python/cellxgene_census/pyproject.toml index d24d48346..7d76dddf5 100644 --- a/api/python/cellxgene_census/pyproject.toml +++ b/api/python/cellxgene_census/pyproject.toml @@ -68,9 +68,75 @@ exclude = ["tests*"] # exclude packages matching these glob patterns (empty by [tool.setuptools_scm] root = "../../.." -[tool.black] +[tool.ruff] line-length = 120 -target_version = ['py39'] +src = ["api/python/cellxgene_census/src"] +target-version = "py38" + +[tool.ruff.lint] +select = [ + "F", # Errors detected by Pyflakes + "E", # Error detected by Pycodestyle + "W", # Warning detected by Pycodestyle + "I", # isort + "D", # pydocstyle + "B", # flake8-bugbear + "TID", # flake8-tidy-imports + "C4", # flake8-comprehensions + "BLE", # flake8-blind-except + "UP", # pyupgrade + "RUF100", # Report unused noqa directives +] +ignore = [ + # line too long -> we accept long comment lines; formatter gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E731", + # allow I, O, l as variable names -> I is the identity matrix + "E741", + # Missing docstring in public package + "D104", + # Missing docstring in public module + "D100", + # Missing docstring in __init__ + "D107", + # Errors from function calls in argument defaults. These are fine when the result is immutable. + "B008", + # __magic__ methods are often self-explanatory, allow missing docstrings + "D105", + # first line should end with a period [Bug: doesn't work with single-line docstrings] + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + ## Disable one in each pair of mutually incompatible rules + # We don’t want a blank line before a class docstring + "D203", + # We want docstrings to start immediately after the opening triple quote + "D213", + # Missing argument description in the docstring TODO: enable + "D417", + # Blank line required between summary line and description TODO: enable + "D205", + # Prefer absolute imports over relative imports from parent modules TODO: enable + "TID252", +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"*/tests/*" = ["D"] +"*/__init__.py" = ["F401"] + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false +# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto" [tool.mypy] show_error_codes = true @@ -86,12 +152,3 @@ markers = [ "experimental: tests for the `experimental` package", "lts_compat_check: check for compatibility with an LTS build", ] - -[tool.ruff] -select = ["E", "F", "B", "I"] -ignore = ["E501", "E402", "C408", ] -line-length = 120 -target-version = "py39" - -[tool.ruff.isort] -known-first-party =["cellxgene_census"] diff --git a/api/python/cellxgene_census/src/cellxgene_census/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/__init__.py index 9c34faee4..aed3952b3 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/__init__.py @@ -1,5 +1,4 @@ -""" -An API to facilitate use of the CZI Science CELLxGENE Census. The Census is a versioned container of single-cell data hosted at `CELLxGENE Discover`_. +"""An API to facilitate use of the CZI Science CELLxGENE Census. The Census is a versioned container of single-cell data hosted at `CELLxGENE Discover`_. The API is built on the `tiledbsoma` SOMA API, and provides a number of helper functions including: @@ -23,7 +22,12 @@ from importlib import metadata from ._get_anndata import get_anndata -from ._open import download_source_h5ad, get_default_soma_context, get_source_h5ad_uri, open_soma +from ._open import ( + download_source_h5ad, + get_default_soma_context, + get_source_h5ad_uri, + open_soma, +) from ._presence_matrix import get_presence_matrix from ._release_directory import ( get_census_mirror_directory, diff --git a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py index 5c51e6e42..71c340db6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py @@ -2,8 +2,7 @@ # # Licensed under the MIT License. -""" -Experiments handler. +"""Experiments handler. Contains methods to retrieve SOMA Experiments. """ @@ -34,7 +33,6 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: maturing Examples: - >>> human = get_experiment(census, "homo sapiens") >>> human = get_experiment(census, "Homo sapiens") diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index cbca2e321..de51ffb42 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -2,7 +2,7 @@ # # Licensed under the MIT License. -"""Get slice as AnnData +"""Get slice as AnnData. Methods to retrieve slices of the census as AnnData objects. """ @@ -28,8 +28,7 @@ def get_anndata( var_coords: Optional[SparseDFCoord] = None, column_names: Optional[soma.AxisColumnNames] = None, ) -> anndata.AnnData: - """ - Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, + """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. 
Args: @@ -81,4 +80,9 @@ def get_anndata( obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords), ) as query: - return query.to_anndata(X_name=X_name, column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers) + return query.to_anndata( + X_name=X_name, + column_names=column_names, + X_layers=X_layers, + obsm_layers=obsm_layers, + ) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_open.py b/api/python/cellxgene_census/src/cellxgene_census/_open.py index 61a8ed8e6..e9c4f53dd 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_open.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_open.py @@ -2,7 +2,7 @@ # # Licensed under the MIT License. -"""Open census and related datasets +"""Open census and related datasets. Contains methods to open publicly hosted versions of Census object and access its source datasets. """ @@ -41,8 +41,7 @@ def _assert_mirror_supported(mirror: CensusMirror) -> None: - """ - Verifies if the mirror is supported by this version of the census API. + """Verifies if the mirror is supported by this version of the census API. This method provides a proper error message in case an old version of the census tries to connect to an unsupported mirror. """ @@ -65,12 +64,10 @@ def _resolve_census_locator(locator: CensusLocator, mirror: CensusMirror) -> Res def _open_soma( - locator: ResolvedCensusLocator, context: Optional[soma.options.SOMATileDBContext] = None + locator: ResolvedCensusLocator, + context: Optional[soma.options.SOMATileDBContext] = None, ) -> soma.Collection: - """ - Private. Merge config defaults and return open census as a soma Collection/context. - """ - + """Private. Merge config defaults and return open census as a soma Collection/context.""" # if no user-defined context, cellxgene_census defaults take precedence over SOMA defaults context = context or get_default_soma_context() @@ -98,7 +95,6 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> A :class:``tiledbsoma.SOMATileDBContext` object with sensible defaults. Examples: - To reduce the amount of memory used by TileDB-SOMA I/O operations: .. highlight:: python @@ -122,7 +118,6 @@ def get_default_soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> Lifecycle: experimental """ - tiledb_config = dict(DEFAULT_TILEDB_CONFIGURATION, **(tiledb_config or {})) return soma.options.SOMATileDBContext().replace(tiledb_config=tiledb_config) @@ -209,7 +204,6 @@ def open_soma( >>> with cellxgene_census.open_soma(tiledb_config={"py.init_buffer_bytes": 128 * 1024**2}) as census: ... """ - if tiledb_config is not None and context is not None: raise ValueError("Only one of tiledb_config and context can be specified.") diff --git a/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py b/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py index af9783084..4db8a65bb 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py @@ -2,7 +2,7 @@ # # Licensed under the MIT License. -"""Presence matrix methods +"""Presence matrix methods. Methods to retrieve the feature dataset presence matrix. 
""" @@ -44,7 +44,6 @@ def get_presence_matrix( <321x60554 sparse array of type '' with 6441269 stored elements in Compressed Sparse Row format> """ - exp = _get_experiment(census, organism) presence = exp.ms[measurement_name]["feature_dataset_presence_matrix"] return presence.read((slice(None),)).coos().concat().to_scipy().tocsr() diff --git a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py index fd3219ae3..25465a763 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_release_directory.py @@ -2,7 +2,7 @@ # # Licensed under the MIT License. -"""Versioning of Census builds +"""Versioning of Census builds. Methods to retrieve information about versions of the publicly hosted Census object. """ @@ -18,36 +18,75 @@ to bootstrap all data location requests. """ CensusVersionName = str # census version name, e.g., "release-99", "2022-10-01-test", etc. -CensusLocator = TypedDict( - "CensusLocator", - { - "uri": str, # [deprecated: only used in census < 1.6.0] absolute resource URI. - "relative_uri": str, # resource URI (relative) - "s3_region": Optional[str], # [deprecated: only used in census < 1.6.0] if an S3 URI, has optional region - }, -) -CensusVersionRetraction = TypedDict( - "CensusVersionRetraction", - { - "date": str, # the date of retraction - "reason": Optional[str], # the reason for retraction - "info_url": Optional[str], # a permalink to more information - "replaced_by": Optional[str], # the census version that replaces this one - }, -) + + +class CensusLocator(TypedDict): + """A locator for a Census resource. + + Args: + uri: + Absolute resource URI (deprecated: only used in census < 1.6.0). + relative_uri: + Resource URI (relative). + s3_region: + If an S3 URI, has optional region (deprecated: only used in census < 1.6.0). + """ + + uri: str + relative_uri: str + s3_region: Optional[str] + + +class CensusVersionRetraction(TypedDict): + """A retraction of a Census version. + + Args: + date: + The date of retraction. + reason: + The reason for retraction. + info_url: + A permalink to more information. + replaced_by: + The census version that replaces this one. + """ + + date: str + reason: Optional[str] + info_url: Optional[str] + replaced_by: Optional[str] + + ReleaseFlag = Literal["lts", "retracted"] ReleaseFlags = Dict[ReleaseFlag, bool] -CensusVersionDescription = TypedDict( - "CensusVersionDescription", - { - "release_date": Optional[str], # date of release (deprecated) - "release_build": str, # date of build - "soma": CensusLocator, # SOMA objects locator - "h5ads": CensusLocator, # source H5ADs locator - "flags": NotRequired[ReleaseFlags], # flags for the release - "retraction": NotRequired[CensusVersionRetraction], # if retracted, details of the retraction - }, -) + + +class CensusVersionDescription(TypedDict): + """A description of a Census version. + + Args: + release_date: + The date of the release (deprecated). + release_build: + Date of build. + soma: + SOMA objects locator. + h5ads: + Source H5ADs locator. + flags: + Flags for the release. + retraction: + If retracted, details of the retraction. 
+ """ + + release_date: Optional[str] + release_build: str + soma: CensusLocator + h5ads: CensusLocator + flags: NotRequired[ReleaseFlags] + retraction: NotRequired[CensusVersionRetraction] + + CensusDirectory = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]] """ @@ -57,44 +96,63 @@ """ Provider = Literal["S3", "file", "unknown"] -""" -A mirror identifies a location that can host the census artifacts. A dict of available mirrors exists -in the mirrors.json file, and looks like this: - -{ - "default": "default-mirror", - "default-mirror": { - "provider": "S3", - "base_uri": "s3://a-public-bucket/", - "region": "us-west-2" - } -} -""" CensusMirrorName = str # name of the mirror -CensusMirror = TypedDict( - "CensusMirror", - { - "provider": Provider, # provider of the mirror. - "base_uri": str, # base URI for the mirror location, e.g. s3://cellxgene-data-public/ - "region": Optional[str], # region of the bucket or resource - }, -) + + +class CensusMirror(TypedDict): + """A mirror for a Census resource. + + A mirror identifies a location that can host the census artifacts. A dict of available mirrors exists in the + ``mirrors.json`` file, and looks like this: + + .. highlight:: json + .. code-block:: json + + { + "default": "default-mirror", + "default-mirror": { + "provider": "S3", + "base_uri": "s3://a-public-bucket/", + "region": "us-west-2" + } + } + + Args: + provider: + Provider of the mirror. + base_uri: + Base URI for the mirror location, e.g. s3://cellxgene-data-public/. + region: + Region of the bucket or resource. + """ + + provider: Provider + base_uri: str + region: Optional[str] + CensusMirrors = Dict[CensusMirrorName, Union[CensusMirrorName, CensusMirror]] -""" -A `ResolvedCensusLocator` represent an absolute location of a Census resource, including the provider info. -It is obtained by resolving a relative location against a specified mirror. -""" -ResolvedCensusLocator = TypedDict( - "ResolvedCensusLocator", - { - "uri": str, # resource URI (absolute) - "region": Optional[str], # if an S3 URI, has optional region - "provider": str, # Provider - }, -) + +class ResolvedCensusLocator(TypedDict): + """A resolved locator for a Census resource. + + A `ResolvedCensusLocator` represent an absolute location of a Census resource, including the provider info. It is + obtained by resolving a relative location against a specified mirror. + + Args: + uri: + Resource URI (absolute). + region: + If an S3 URI, has optional region. + provider: + Provider. + """ + + uri: str + region: Optional[str] + provider: str # URL for the default top-level directory of all public data @@ -140,8 +198,7 @@ def get_census_version_description(census_version: str) -> CensusVersionDescript def get_census_version_directory( *, lts: Optional[bool] = None, retracted: Optional[bool] = False ) -> Dict[CensusVersionName, CensusVersionDescription]: - """ - Get the directory of Census versions currently available, optionally filtering by specified + """Get the directory of Census versions currently available, optionally filtering by specified flags. If a filtering flag is not specified, Census versions will not be filtered by that flag. Defaults to including both "long-term stable" (LTS) and weekly Census versions, and excluding retracted versions. 
@@ -325,11 +382,9 @@ def get_census_version_directory( census_version_description = cast(CensusVersionDescription, directory_value) release_flags = cast(ReleaseFlags, {"lts": lts, "retracted": retracted}) admitted = all( - [ - census_version_description.get("flags", {}).get(flag_name, False) == release_flags[flag_name] - for flag_name, flag_value in release_flags.items() - if flag_value is not None - ] + census_version_description.get("flags", {}).get(flag_name, False) == release_flags[flag_name] + for flag_name, flag_value in release_flags.items() + if flag_value is not None ) if not admitted: continue @@ -354,8 +409,7 @@ def get_census_version_directory( def get_census_mirror_directory() -> Dict[CensusMirrorName, CensusMirror]: - """ - Get the directory of Census mirrors currently available. + """Get the directory of Census mirrors currently available. Returns: A dictionary that contains mirror names and their corresponding info, diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 29bc540ab..8b7e5685b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -2,14 +2,19 @@ def _uri_join(base: str, url: str) -> str: - """ - like urllib.parse.urljoin, but doesn't get confused by S3:// - """ + """Like urllib.parse.urljoin, but doesn't get confused by s3://.""" p_url = urllib.parse.urlparse(url) if p_url.netloc: return url p_base = urllib.parse.urlparse(base) path = urllib.parse.urljoin(p_base.path, p_url.path) - parts = [p_base.scheme, p_base.netloc, path, p_url.params, p_url.query, p_url.fragment] + parts = [ + p_base.scheme, + p_base.netloc, + path, + p_url.params, + p_url.query, + p_url.fragment, + ] return urllib.parse.urlunparse(parts) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index 283f7bffe..c37c08789 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,6 +1,4 @@ -""" -Experimental API for the CELLxGENE Disover Census -""" +"""Experimental API for the CELLxGENE Discover Census.""" from ._embedding import get_embedding, get_embedding_metadata diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 08df47847..8586fecd2 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -2,14 +2,12 @@ # # Licensed under the MIT License. -""" -Methods to support simplifed access to community contributed embeddings. -""" +"""Methods to support simplified access to community contributed embeddings.""" from __future__ import annotations import json import warnings -from typing import Any, Dict, Optional, Union, cast +from typing import Any, Dict, cast import numpy as np import numpy.typing as npt @@ -21,11 +19,8 @@ from .._release_directory import get_census_version_directory -def get_embedding_metadata( - embedding_uri: str, context: Optional[soma.options.SOMATileDBContext] = None -) -> Dict[str, Any]: - """ - Read embedding metadata and return as a Python dict.
+def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBContext | None = None) -> dict[str, Any]: + """Read embedding metadata and return as a Python dict. Args: embedding_uri: @@ -41,7 +36,6 @@ def get_embedding_metadata( >>> get_experiment_metadata(uri) """ - # Allow the user to override context for exceptional cases (e.g. the aws region) context = context or get_default_soma_context() @@ -56,11 +50,10 @@ def get_embedding_metadata( def get_embedding( census_version: str, embedding_uri: str, - obs_soma_joinids: Union[npt.NDArray[np.int64], pa.Array], - context: Optional[soma.options.SOMATileDBContext] = None, + obs_soma_joinids: npt.NDArray[np.int64] | pa.Array, + context: soma.options.SOMATileDBContext | None = None, ) -> npt.NDArray[np.float32]: - """ - Read cell (obs) embeddings and return as a dense :class:`numpy.ndarray`. Any cells without + """Read cell (obs) embeddings and return as a dense :class:`numpy.ndarray`. Any cells without an embedding will return NaN values. Args: @@ -98,7 +91,6 @@ def get_embedding( dtype=float32) """ - if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): obs_soma_joinids = obs_soma_joinids.to_numpy() assert isinstance(obs_soma_joinids, np.ndarray) diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py index 942d109ea..99a155bc4 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/__init__.py @@ -1,6 +1,4 @@ -""" -An API to facilitate use of PyTorch ML training with data from the CZI Science CELLxGENE Census. -""" +"""An API to facilitate use of PyTorch ML training with data from the CZI Science CELLxGENE Census.""" from .pytorch import ExperimentDataPipe, Stats, experiment_dataloader diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/__init__.py index a1caa3ec5..52cd50776 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/__init__.py @@ -1,6 +1,4 @@ -""" -An API to facilitate using Hugging Face ML tools with the CZI Science CELLxGENE Census. -""" +"""An API to facilitate using Hugging Face ML tools with the CZI Science CELLxGENE Census.""" from .cell_dataset_builder import CellDatasetBuilder from .geneformer_tokenizer import GeneformerTokenizer diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py index d9ec2e626..6b274e8fd 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/cell_dataset_builder.py @@ -8,8 +8,7 @@ class CellDatasetBuilder(ExperimentAxisQuery[Experiment], ABC): # type: ignore - """ - Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery + """Abstract base class for methods to process CELLxGENE Census ExperimentAxisQuery results into a Hugging Face Dataset in which each item represents one cell. 
Subclasses implement the `cell_item()` method to process each row of an X layer into a Dataset item, and may also override `__init__()` and context `__enter__()` @@ -41,8 +40,7 @@ def __init__( block_size: Optional[int] = None, **kwargs: Any, ): - """ - Initialize the CellDatasetBuilder to process the results of a Census + """Initialize the CellDatasetBuilder to process the results of a Census ExperimentAxisQuery. - `experiment`: Census Experiment to be queried. @@ -58,8 +56,7 @@ def __init__( self.block_size = block_size def build(self, from_generator_kwargs: Optional[Dict[str, Any]] = None) -> Dataset: - """ - Build the dataset from query results + """Build the dataset from query results. - `from_generator_kwargs`: kwargs passed through to `Dataset.from_generator()` """ @@ -76,8 +73,7 @@ def gen() -> Generator[Dict[str, Any], None, None]: @abstractmethod def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: - """ - Abstract method to process the X row for one cell into a Dataset item. + """Abstract method to process the X row for one cell into a Dataset item. - `cell_joinid`: The cell `soma_joinid`. - `Xrow`: The `X` row for this cell. This csr_matrix has a single row 0, equal @@ -87,9 +83,7 @@ def cell_item(self, cell_joinid: int, Xrow: scipy.sparse.csr_matrix) -> Dict[str class _DatasetGeneratorPickleHack: - """ - SEE: https://github.com/huggingface/datasets/issues/6194 - """ + """SEE: https://github.com/huggingface/datasets/issues/6194.""" def __init__(self, generator: Any, generator_id: Optional[str] = None) -> None: self.generator = generator diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py index 036f942ae..3c8310fe1 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/huggingface/geneformer_tokenizer.py @@ -10,8 +10,7 @@ class GeneformerTokenizer(CellDatasetBuilder): - """ - Generate a Hugging Face `Dataset` containing Geneformer token sequences for each + """Generate a Hugging Face `Dataset` containing Geneformer token sequences for each cell in CELLxGENE Census ExperimentAxisQuery results (human). This class requires the Geneformer package to be installed separately with: @@ -62,8 +61,7 @@ def __init__( gene_median_file: str = "", **kwargs: Any, ) -> None: - """ - - `experiment`: Census Experiment to query + """- `experiment`: Census Experiment to query - `obs_query`: obs AxisQuery defining the set of Census cells to process (default all) - `obs_column_names`: obs dataframe columns (cell metadata) to propagate into attributes of each Dataset item @@ -86,12 +84,14 @@ def __init__( ) def _load_geneformer_data( - self, experiment: tiledbsoma.Experiment, token_dictionary_file: str, gene_median_file: str + self, + experiment: tiledbsoma.Experiment, + token_dictionary_file: str, + gene_median_file: str, ) -> None: - """ - Load (1) the experiment's genes dataframe and (2) Geneformer's static data + """Load (1) the experiment's genes dataframe and (2) Geneformer's static data files for gene tokens and median expression; then, intersect them to compute - self.model_gene_{ids,tokens,medians} + self.model_gene_{ids,tokens,medians}. 
""" # TODO: this work could be reused for all queries on this experiment @@ -153,8 +153,7 @@ def __enter__(self) -> "GeneformerTokenizer": return self def cell_item(self, cell_joinid: int, cell_Xrow: scipy.sparse.csr_matrix) -> Dict[str, Any]: - """ - Given the expression vector for one cell, compute the Dataset item providing + """Given the expression vector for one cell, compute the Dataset item providing the Geneformer inputs (token sequence and metadata). """ # project cell_Xrow onto model_gene_ids and normalize by row sum. diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py index e94b4b13a..8634214f0 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py @@ -32,14 +32,13 @@ # TODO: Rename to reflect the correct order of the Tensors within the tuple: (X, obs) ObsAndXDatum = Tuple[Tensor, Tensor] -"""Return type of ``ExperimentDataPipe`` that pairs a Tensor of ``obs`` row(s) with a Tensor of ``X`` matrix row(s). +"""Return type of ``ExperimentDataPipe`` that pairs a Tensor of ``obs`` row(s) with a Tensor of ``X`` matrix row(s). The Tensors are rank 1 if ``batch_size`` is 1, otherwise the Tensors are rank 2.""" @define class _SOMAChunk: - """ - Return type of ``_ObsAndXSOMAIterator`` that pairs a chunk of ``obs`` rows with the respective rows from the ``X`` + """Return type of ``_ObsAndXSOMAIterator`` that pairs a chunk of ``obs`` rows with the respective rows from the ``X`` matrix. Lifecycle: @@ -60,8 +59,7 @@ def __len__(self) -> int: @define class Stats: - """ - Statistics about the data retrieved by ``ExperimentDataPipe`` via SOMA API. This is useful for assessing the read + """Statistics about the data retrieved by ``ExperimentDataPipe`` via SOMA API. This is useful for assessing the read throughput of SOMA data. Lifecycle: @@ -97,7 +95,6 @@ def _open_experiment( aws_region: Optional[str] = None, ) -> soma.Experiment: """Internal method for opening a SOMA ``Experiment`` as a context manager.""" - context = get_default_soma_context().replace(tiledb_config={"vfs.s3.region": aws_region} if aws_region else {}) with soma.Experiment.open(uri, context=context) as exp: @@ -106,7 +103,8 @@ def _open_experiment( class _ObsAndXSOMAIterator(Iterator[_SOMAChunk]): """Iterates the SOMA chunks of corresponding ``obs`` and ``X`` data. This is an internal class, - not intended for public use.""" + not intended for public use. 
+ """ X: soma.SparseNDArray """A handle to the full X data of the SOMA ``Experiment``""" @@ -133,7 +131,8 @@ def __init__( @staticmethod def _maybe_local_shuffle_obs_joinids( - obs_joinids_chunked: List[npt.NDArray[np.int64]], shuffle_rng: Optional[Generator] = None + obs_joinids_chunked: List[npt.NDArray[np.int64]], + shuffle_rng: Optional[Generator] = None, ) -> Iterator[npt.NDArray[np.int64]]: return ( shuffle_rng.permutation(obs_joinid_chunk) if shuffle_rng else obs_joinid_chunk @@ -185,7 +184,7 @@ def __next__(self) -> _SOMAChunk: return _SOMAChunk(obs=obs_batch, X=X_batch, stats=stats) -def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any]]: +def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any]]: # noqa: D103 proc = psutil.Process(os.getpid()) pre_gc = proc.memory_full_info(), psutil.virtual_memory(), psutil.swap_memory() @@ -199,8 +198,7 @@ def run_gc() -> Tuple[Tuple[Any, Any, Any], Tuple[Any, Any, Any]]: class _ObsAndXIterator(Iterator[ObsAndXDatum]): - """ - Iterates through a set of ``obs`` and corresponding ``X`` rows, where the rows to be returned are specified by + """Iterates through a set of ``obs`` and corresponding ``X`` rows, where the rows to be returned are specified by the ``obs_tables_iter`` argument. For the specified ``obs` rows, the corresponding ``X`` data is loaded and joined together. It is returned from this iterator as 2-tuples of ``X`` and obs Tensors. @@ -247,8 +245,7 @@ def __init__( self.X_dtype = X.schema[2].type.to_pandas_dtype() def __next__(self) -> ObsAndXDatum: - """Read the next torch batch, possibly across multiple soma chunks""" - + """Read the next torch batch, possibly across multiple soma chunks.""" obs: pd.DataFrame = pd.DataFrame() X: sparse.csr_matrix = sparse.csr_matrix((0, len(self.var_joinids)), dtype=self.X_dtype) @@ -264,7 +261,9 @@ def __next__(self) -> ObsAndXDatum: raise StopIteration obs_encoded = pd.DataFrame( - data={"soma_joinid": obs.index}, columns=["soma_joinid"] + obs.columns.tolist(), dtype=np.int64 + data={"soma_joinid": obs.index}, + columns=["soma_joinid"] + obs.columns.tolist(), + dtype=np.int64, ) # TODO: Encode the entire SOMA chunk at once in _read_partial_torch_batch() for col, enc in self.encoders.items(): @@ -294,8 +293,8 @@ def __next__(self) -> ObsAndXDatum: def _read_partial_torch_batch(self, batch_size: int) -> ObsAndXDatum: """Reads a torch-size batch of data from the current SOMA chunk, returning a torch-size batch whose size may contain fewer rows than the requested ``batch_size``. This can happen when the remaining rows in the current - SOMA chunk are fewer than the requested ``batch_size``.""" - + SOMA chunk are fewer than the requested ``batch_size``. + """ if self.soma_chunk is None or not (0 <= self.i < len(self.soma_chunk)): # GC memory from previous soma_chunk self.soma_chunk = None @@ -328,10 +327,9 @@ def _read_partial_torch_batch(self, batch_size: int) -> ObsAndXDatum: class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsAndXDatum]]): # type: ignore - """ - An :class:`torchdata.datapipes.iter.IterDataPipe` that reads ``obs`` and ``X`` data from a + """An :class:`torchdata.datapipes.iter.IterDataPipe` that reads ``obs`` and ``X`` data from a :class:`tiledbsoma.Experiment`, based upon the specified queries along the ``obs`` and ``var`` axes. Provides an - iterator over these data when the object is passed to Python's built-in ``iter`` function: + iterator over these data when the object is passed to Python's built-in ``iter`` function. 
>>> for batch in iter(ExperimentDataPipe(...)): X_batch, y_batch = batch @@ -396,8 +394,7 @@ def __init__( soma_chunk_size: Optional[int] = None, use_eager_fetch: bool = True, ) -> None: - """ - Construct a new ``ExperimentDataPipe``. + """Construct a new ``ExperimentDataPipe``. Args: experiment: @@ -503,11 +500,13 @@ def _init(self) -> None: @staticmethod def _subset_ids_to_partition( - ids_chunked: List[npt.NDArray[np.int64]], partition_index: int, num_partitions: int + ids_chunked: List[npt.NDArray[np.int64]], + partition_index: int, + num_partitions: int, ) -> List[npt.NDArray[np.int64]]: """Returns a single partition of the obs_joinids_chunked (a 2D ndarray), based upon the current process's distributed rank and world - size.""" - + size. + """ # subset to a single partition # typing does not reflect that is actually a List of 2D NDArrays partition_indices = np.array_split(range(len(ids_chunked)), num_partitions) @@ -523,7 +522,10 @@ def _subset_ids_to_partition( @staticmethod def _compute_partitions( - loader_partition: int, loader_partitions: int, dist_partition: int, num_dist_partitions: int + loader_partition: int, + loader_partitions: int, + dist_partition: int, + num_dist_partitions: int, ) -> Tuple[int, int]: # NOTE: Can alternately use a `worker_init_fn` to split among workers split workload total_partitions = num_dist_partitions * loader_partitions @@ -586,8 +588,7 @@ def __iter__(self) -> Iterator[ObsAndXDatum]: shuffle_rng=self._shuffle_rng, ) - for datum_ in obs_and_x_iter: - yield datum_ + yield from obs_and_x_iter pytorch_logger.debug( "max process memory usage=" f"{obs_and_x_iter.max_process_mem_usage_bytes / (1024 ** 3):.3f} GiB" @@ -623,8 +624,7 @@ def _build_obs_encoders(self, query: soma.ExperimentAxisQuery) -> Encoders: # TODO: This does not work in multiprocessing mode, as child process's stats are not collected def stats(self) -> Stats: - """ - Get data loading stats for this :class:`cellxgene_census.ml.pytorch.ExperimentDataPipe`. + """Get data loading stats for this :class:`cellxgene_census.ml.pytorch.ExperimentDataPipe`. Returns: The :class:`cellxgene_census.ml.pytorch.Stats` object for this @@ -637,8 +637,7 @@ def stats(self) -> Stats: @property def shape(self) -> Tuple[int, int]: - """ - Get the shape of the data that will be returned by this :class:`cellxgene_census.ml.pytorch.ExperimentDataPipe`. + """Get the shape of the data that will be returned by this :class:`cellxgene_census.ml.pytorch.ExperimentDataPipe`. This is the number of obs (cell) and var (feature) counts in the returned data. If used in multiprocessing mode (i.e. :class:`torch.utils.data.DataLoader` instantiated with num_workers > 0), the obs (cell) count will reflect the size of the partition of the data assigned to the active process. @@ -657,10 +656,10 @@ def shape(self) -> Tuple[int, int]: @property def obs_encoders(self) -> Encoders: - """ - Returns a dictionary of :class:`sklearn.preprocessing.LabelEncoder` objects, keyed on ``obs`` column names, - which were used to encode the ``obs`` column values. These encoders can be used to decode the encoded values as - follows: + """Returns a dictionary of :class:`sklearn.preprocessing.LabelEncoder` objects, keyed on ``obs`` column names, + which were used to encode the ``obs`` column values. 
+ + These encoders can be used to decode the encoded values as follows: >>> exp_data_pipe.obs_encoders[""].inverse_transform(encoded_values) @@ -684,8 +683,7 @@ def experiment_dataloader( num_workers: int = 0, **dataloader_kwargs: Any, ) -> DataLoader: - """ - Factory method for :class:`torch.utils.data.DataLoader`. This method can be used to safely instantiate a + """Factory method for :class:`torch.utils.data.DataLoader`. This method can be used to safely instantiate a :class:`torch.utils.data.DataLoader` that works with :class:`cellxgene_census.ml.pytorch.ExperimentDataPipe`, since some of the :class:`torch.utils.data.DataLoader` constructor parameters are not applicable when using a :class:`torchdata.datapipes.iter.IterDataPipe` (``shuffle``, ``batch_size``, ``sampler``, ``batch_sampler``, @@ -714,8 +712,13 @@ def experiment_dataloader( Lifecycle: experimental """ - - unsupported_dataloader_args = ["shuffle", "batch_size", "sampler", "batch_sampler", "collate_fn"] + unsupported_dataloader_args = [ + "shuffle", + "batch_size", + "sampler", + "batch_sampler", + "collate_fn", + ] if set(unsupported_dataloader_args).intersection(dataloader_kwargs.keys()): raise ValueError(f"The {','.join(unsupported_dataloader_args)} DataLoader params are not supported") @@ -736,6 +739,7 @@ def experiment_dataloader( def _init_multiprocessing() -> None: """Ensures use of "spawn" for starting child processes with multiprocessing. + Forked processes are known to be problematic: https://pytorch.org/docs/stable/notes/multiprocessing.html#avoiding-and-fighting-deadlocks Also, CUDA does not support forked child processes: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/__init__.py index a52cdc0aa..4fb090a10 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/__init__.py @@ -1,6 +1,4 @@ -""" -API to facilitate preprocessing of SOMA datasets. -""" +"""API to facilitate preprocessing of SOMA datasets.""" from ._highly_variable_genes import get_highly_variable_genes, highly_variable_genes from ._stats import mean_variance diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py index 192d1e33e..aa0b8f34b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_highly_variable_genes.py @@ -2,7 +2,7 @@ import os from concurrent import futures -from typing import Any, Callable, Optional, Sequence, Union, cast +from typing import Any, Callable, Sequence, cast import numpy as np import pandas as pd @@ -18,7 +18,7 @@ Acknowledgements: ScanPy highly variable genes implementation (scanpy.pp.highly_variable_genes), in turn based upon the original implementation in Seurat V3. 
-Ref: +Ref: * https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.highly_variable_genes.html#scanpy.pp.highly_variable_genes * github.com/scverse/scanpy @@ -39,12 +39,10 @@ def _get_batch_index( query: soma.ExperimentAxisQuery, - batch_key: Union[str, Sequence[str]], - batch_key_func: Optional[Callable[..., Any]] = None, + batch_key: str | Sequence[str], + batch_key_func: Callable[..., Any] | None = None, ) -> pd.Series[Any]: - """ - Return categorical series representing the batch key, with codes that index the key. - """ + """Return categorical series representing the batch key, with codes that index the key.""" if isinstance(batch_key, str): batch_key = [batch_key] batch_key = list(batch_key) @@ -73,12 +71,12 @@ def _get_batch_index( def _highly_variable_genes_seurat_v3( query: soma.ExperimentAxisQuery, - batch_key: Optional[Union[str, Sequence[str]]] = None, - n_top_genes: int = 1000, + batch_key: str | Sequence[str] | None = None, + n_top_genes: int = 1_000, layer: str = "raw", span: float = 0.3, max_loess_jitter: float = 1e-6, - batch_key_func: Optional[Callable[..., Any]] = None, + batch_key_func: Callable[..., Any] | None = None, ) -> pd.DataFrame: try: import skmisc.loess @@ -231,16 +229,15 @@ def _highly_variable_genes_seurat_v3( def highly_variable_genes( query: soma.ExperimentAxisQuery, - n_top_genes: int = 1000, + n_top_genes: int = 1_000, layer: str = "raw", flavor: Literal["seurat_v3"] = "seurat_v3", span: float = 0.3, - batch_key: Optional[Union[str, Sequence[str]]] = None, + batch_key: str | Sequence[str] | None = None, max_loess_jitter: float = 1e-6, - batch_key_func: Optional[Callable[..., Any]] = None, + batch_key_func: Callable[..., Any] | None = None, ) -> pd.DataFrame: - """ - Identify and annotate highly variable genes contained in the query results. + """Identify and annotate highly variable genes contained in the query results. The API is modelled on ScanPy `scanpy.pp.highly_variable_genes` API. Results returned will mimic ScanPy results. The only `flavor` available is the Seurat V3 method, which assumes count data in the X layer. @@ -282,7 +279,6 @@ def highly_variable_genes( Examples: - Fetch :class:`pandas.DataFrame` containing var annotations for the query selection, using ``"dataset_id"`` as ``batch_key``. @@ -322,19 +318,18 @@ def get_highly_variable_genes( organism: str, measurement_name: str = "RNA", X_name: str = "raw", - obs_value_filter: Optional[str] = None, - obs_coords: Optional[SparseDFCoord] = None, - var_value_filter: Optional[str] = None, - var_coords: Optional[SparseDFCoord] = None, - n_top_genes: int = 1000, + obs_value_filter: str | None = None, + obs_coords: SparseDFCoord | None = None, + var_value_filter: str | None = None, + var_coords: SparseDFCoord | None = None, + n_top_genes: int = 1_000, flavor: Literal["seurat_v3"] = "seurat_v3", span: float = 0.3, - batch_key: Optional[Union[str, Sequence[str]]] = None, + batch_key: str | Sequence[str] | None = None, max_loess_jitter: float = 1e-6, - batch_key_func: Optional[Callable[..., Any]] = None, + batch_key_func: Callable[..., Any] | None = None, ) -> pd.DataFrame: - """ - Convience wrapper around :class:`tiledbsoma.Experiment` query and + """Convenience wrapper around :class:`tiledbsoma.Experiment` query and :func:`cellxgene_census.experimental.pp.highly_variable_genes` function, to build and execute a query, and annotate the query result genes (``var`` dataframe) based upon variability.
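A usage sketch for the wrapper documented above, in the doctest style of the surrounding docstrings; the census version, organism, filter, and ``batch_key`` values are illustrative, borrowed from tests elsewhere in this patch:

>>> import cellxgene_census
>>> from cellxgene_census.experimental.pp import get_highly_variable_genes
>>> with cellxgene_census.open_soma(census_version="stable") as census:
...     hvg = get_highly_variable_genes(
...         census,
...         organism="mus_musculus",
...         obs_value_filter='is_primary_data == True and tissue_general == "heart"',
...         n_top_genes=500,  # illustrative; the default is 1_000
...         batch_key="dataset_id",
...     )

The result is a :class:`pandas.DataFrame` with one row per selected gene (``var`` row), annotated with variability statistics.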
@@ -387,7 +382,6 @@ def get_highly_variable_genes( :func:`cellxgene_census.experimental.pp.highly_variable_genes` Examples: - Fetch a :class:`pandas.DataFrame` containing var annotations for a subset of the cells matching the ``obs_value_filter`: diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py index c66a7e78f..2eaf71a7b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_online.py @@ -6,8 +6,7 @@ class MeanVarianceAccumulator: - """ - Online mean/variance for n_variables over n_samples, where the samples are + """Online mean/variance for n_variables over n_samples, where the samples are divided into n_batches (n_batches << n_samples). Accumulates each batch separately. Batches implemented using Chan's parallel adaptation of Welford's online algorithm. @@ -55,7 +54,12 @@ def update( def finalize( self, - ) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64], npt.NDArray[np.float64]]: + ) -> Tuple[ + npt.NDArray[np.float64], + npt.NDArray[np.float64], + npt.NDArray[np.float64], + npt.NDArray[np.float64], + ]: # correct each batch to account for sparsity. # if nnz_only, the correction is not needed as we only do mean/average over nonzero values if not self.nnz_only: @@ -125,11 +129,22 @@ def update( ) -> None: if self.n_batches == 1: assert batch_vec is None - _accum_clipped_counts(self.counts_sum[0], self.squared_counts_sum[0], var_vec, val_vec, self.clip_val[0]) + _accum_clipped_counts( + self.counts_sum[0], + self.squared_counts_sum[0], + var_vec, + val_vec, + self.clip_val[0], + ) else: assert batch_vec is not None _accum_clipped_counts_by_batch( - self.counts_sum, self.squared_counts_sum, batch_vec, var_vec, val_vec, self.clip_val + self.counts_sum, + self.squared_counts_sum, + batch_vec, + var_vec, + val_vec, + self.clip_val, ) def finalize(self) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: @@ -174,8 +189,7 @@ def _mbomv_update_by_batch( u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], ) -> None: - """ - Incrementally accumulate mean and sum of square of distance from mean using + """Incrementally accumulate mean and sum of square of distance from mean using Welford's online method. """ for batch, col, val in zip(batch_vec, var_vec, val_vec): @@ -213,8 +227,7 @@ def _mbomv_update_single_batch( u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], ) -> None: - """ - Incrementally accumulate mean and sum of square of distance from mean using + """Incrementally accumulate mean and sum of square of distance from mean using Welford's online method. """ for col, val in zip(var_vec, val_vec): @@ -226,7 +239,13 @@ def _mbomv_update_single_batch( @numba.jit( - numba.void(numba.int64, numba.int64[:], numba.int32[:, :], numba.float64[:, :], numba.float64[:, :]), + numba.void( + numba.int64, + numba.int64[:], + numba.int32[:, :], + numba.float64[:, :], + numba.float64[:, :], + ), nopython=True, nogil=True, ) # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 @@ -237,8 +256,7 @@ def _mbomv_sparse_correct_batches( u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], ) -> None: - """ - Finalize incremental accumulators to account for missing elements (due to sparse + """Finalize incremental accumulators to account for missing elements (due to sparse input). 
Non-sparse and sparse combined using Chan's parallel adaptation of Welford's. The code assumes the sparse elements are all zero. """ @@ -265,8 +283,7 @@ def _mbomv_combine_batches( u: npt.NDArray[np.float64], M2: npt.NDArray[np.float64], ) -> Tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: - """ - Combine all batches using Chan's parallel adaptation of Welford's. + """Combine all batches using Chan's parallel adaptation of Welford's. Returns tuple of (u, M2). """ diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py index bd1735930..54e498af5 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/pp/_stats.py @@ -1,7 +1,7 @@ from __future__ import annotations from concurrent import futures -from typing import Any, Generator, Tuple +from typing import Any, Generator import numpy as np import numpy.typing as npt @@ -21,8 +21,7 @@ def mean_variance( ddof: int = 1, nnz_only: bool = False, ) -> pd.DataFrame: - """ - Calculate mean and/or variance along the ``obs`` axis from query results. Calculations are done in an accumulative + """Calculate mean and/or variance along the ``obs`` axis from query results. Calculations are done in an accumulative chunked fashion. For the mean and variance calculations, the total number of elements (N) is, by default, the corresponding dimension size: for column-wise calculations (``axis = 0``) N is number of rows, for row-wise calculations (``axis = 1``) N is number of columns. For metrics calculated only on nnz (explicitly stored) values of @@ -54,7 +53,6 @@ def mean_variance( Lifecycle: experimental """ - if axis not in (0, 1): raise ValueError("axis must be 0 or 1") @@ -70,9 +68,12 @@ def mean_variance( n_batches = 1 n_samples = np.array([n_dim_1], dtype=np.int64) - idx = pd.Index(data=query.obs_joinids() if axis == 1 else query.var_joinids(), name="soma_joinid") + idx = pd.Index( + data=query.obs_joinids() if axis == 1 else query.var_joinids(), + name="soma_joinid", + ) - def iterate() -> Generator[Tuple[npt.NDArray[np.int64], Any], None, None]: + def iterate() -> Generator[tuple[npt.NDArray[np.int64], Any], None, None]: with futures.ThreadPoolExecutor(max_workers=1) as pool: # Note: _EagerIterator only supports one thread for arrow_tbl in _EagerIterator(query.X(layer).tables(), pool=pool): dim = idx.get_indexer(arrow_tbl[f"soma_dim_{1-axis}"].to_numpy()) diff --git a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py index f5d982400..f87b282cc 100644 --- a/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py +++ b/api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py @@ -28,7 +28,10 @@ def pytorch_x_value_gen(obs_range: range, var_range: range) -> spmatrix: - occupied_shape = (obs_range.stop - obs_range.start, var_range.stop - var_range.start) + occupied_shape = ( + obs_range.stop - obs_range.start, + var_range.stop - var_range.start, + ) checkerboard_of_ones = coo_matrix(np.indices(occupied_shape).sum(axis=0) % 2) checkerboard_of_ones.row += obs_range.start checkerboard_of_ones.col += var_range.start @@ -360,7 +363,8 @@ def test_encoders(soma_experiment: Experiment) -> None: @pytest.mark.experimental @pytest.mark.skipif( - (sys.version_info.major, sys.version_info.minor) == (3, 9), reason="fails intermittently with OOM error for 
3.9" + (sys.version_info.major, sys.version_info.minor) == (3, 9), + reason="fails intermittently with OOM error for 3.9", ) # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(6, 3, pytorch_x_value_gen)]) @@ -387,7 +391,9 @@ def test_multiprocessing__returns_full_result(soma_experiment: Experiment) -> No @pytest.mark.experimental # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(6, 3, pytorch_x_value_gen)]) -def test_distributed__returns_data_partition_for_rank(soma_experiment: Experiment) -> None: +def test_distributed__returns_data_partition_for_rank( + soma_experiment: Experiment, +) -> None: """Tests pytorch._partition_obs_joinids() behavior in a simulated PyTorch distributed processing mode, using mocks to avoid having to do real PyTorch distributed setup.""" @@ -401,7 +407,11 @@ def test_distributed__returns_data_partition_for_rank(soma_experiment: Experimen mock_dist_get_world_size.return_value = 3 dp = ExperimentDataPipe( - soma_experiment, measurement_name="RNA", X_name="raw", obs_column_names=["label"], soma_chunk_size=2 + soma_experiment, + measurement_name="RNA", + X_name="raw", + obs_column_names=["label"], + soma_chunk_size=2, ) full_result = list(iter(dp)) @@ -415,7 +425,9 @@ def test_distributed__returns_data_partition_for_rank(soma_experiment: Experimen @pytest.mark.experimental # noinspection PyTestParametrized @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(12, 3, pytorch_x_value_gen)]) -def test_distributed_and_multiprocessing__returns_data_partition_for_rank(soma_experiment: Experiment) -> None: +def test_distributed_and_multiprocessing__returns_data_partition_for_rank( + soma_experiment: Experiment, +) -> None: """Tests pytorch._partition_obs_joinids() behavior in a simulated PyTorch distributed processing mode and DataLoader multiprocessing mode, using mocks to avoid having to do distributed pytorch setup or real DataLoader multiprocessing.""" @@ -433,7 +445,11 @@ def test_distributed_and_multiprocessing__returns_data_partition_for_rank(soma_e mock_dist_get_world_size.return_value = 3 dp = ExperimentDataPipe( - soma_experiment, measurement_name="RNA", X_name="raw", obs_column_names=["label"], soma_chunk_size=2 + soma_experiment, + measurement_name="RNA", + X_name="raw", + obs_column_names=["label"], + soma_chunk_size=2, ) full_result = list(iter(dp)) @@ -461,7 +477,7 @@ def test_experiment_dataloader__non_batched(soma_experiment: Experiment, use_eag use_eager_fetch=use_eager_fetch, ) dl = experiment_dataloader(dp) - torch_data = [row for row in dl] + torch_data = [row for row in dl] # noqa: C416 row = torch_data[0] assert row[0].to_dense().tolist() == [0, 1, 0] @@ -484,7 +500,7 @@ def test_experiment_dataloader__batched(soma_experiment: Experiment, use_eager_f use_eager_fetch=use_eager_fetch, ) dl = experiment_dataloader(dp) - torch_data = [row for row in dl] + torch_data = [row for row in dl] # noqa: C416 batch = torch_data[0] assert batch[0].to_dense().tolist() == [[0, 1, 0], [1, 0, 1], [0, 1, 0]] @@ -515,7 +531,12 @@ def test__X_tensor_dtype_matches_X_matrix(soma_experiment: Experiment, use_eager # noinspection PyTestParametrized,DuplicatedCode @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(10, 1, pytorch_x_value_gen)]) def test__pytorch_splitting(soma_experiment: Experiment) -> None: - dp = ExperimentDataPipe(soma_experiment, measurement_name="RNA", X_name="raw", obs_column_names=["label"]) + dp = ExperimentDataPipe( + soma_experiment, + 
measurement_name="RNA", + X_name="raw", + obs_column_names=["label"], + ) dp_train, dp_test = dp.random_split(weights={"train": 0.7, "test": 0.3}, seed=1234) dl = experiment_dataloader(dp_train) @@ -528,7 +549,11 @@ def test__pytorch_splitting(soma_experiment: Experiment) -> None: @pytest.mark.parametrize("obs_range,var_range,X_value_gen", [(16, 1, pytorch_seq_x_value_gen)]) def test__shuffle(soma_experiment: Experiment) -> None: dp = ExperimentDataPipe( - soma_experiment, measurement_name="RNA", X_name="raw", obs_column_names=["label"], shuffle=True + soma_experiment, + measurement_name="RNA", + X_name="raw", + obs_column_names=["label"], + shuffle=True, ) all_rows = list(iter(dp)) @@ -559,7 +584,9 @@ def test_experiment_dataloader__multiprocess_dense_matrix__ok() -> None: @pytest.mark.experimental @patch("cellxgene_census.experimental.ml.pytorch.ExperimentDataPipe") -def test_experiment_dataloader__unsupported_params__fails(dummy_exp_data_pipe: ExperimentDataPipe) -> None: +def test_experiment_dataloader__unsupported_params__fails( + dummy_exp_data_pipe: ExperimentDataPipe, +) -> None: with pytest.raises(ValueError): experiment_dataloader(dummy_exp_data_pipe, shuffle=True) with pytest.raises(ValueError): diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_hvg.py b/api/python/cellxgene_census/tests/experimental/pp/test_hvg.py index c98b6749b..8448d6c29 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_hvg.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_hvg.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, Optional, Union +from typing import Any import numpy as np import pandas as pd @@ -73,17 +73,17 @@ def test_hvg_vs_scanpy( obs_value_filter: str, version: str, experiment_name: str, - batch_key: Optional[Union[str, tuple[str], list[str]]], + batch_key: str | tuple[str] | list[str] | None, span: float, small_mem_context: soma.SOMATileDBContext, ) -> None: """Compare results with ScanPy on a couple of simple tests.""" - kwargs: Dict[str, Any] = dict( - n_top_genes=n_top_genes, - batch_key=batch_key, - flavor="seurat_v3", - ) + kwargs: dict[str, Any] = { + "n_top_genes": n_top_genes, + "batch_key": batch_key, + "flavor": "seurat_v3", + } if span is not None: kwargs["span"] = span @@ -220,7 +220,7 @@ def test_get_highly_variable_genes( obs_value_filter: str, batch_key: str, small_mem_context: soma.SOMATileDBContext, - obs_coords: Optional[slice], + obs_coords: slice | None, ) -> None: with cellxgene_census.open_soma(census_version="stable", context=small_mem_context) as census: hvg = get_highly_variable_genes( @@ -274,7 +274,7 @@ def test_max_loess_jitter_error(small_mem_context: soma.SOMATileDBContext) -> No ) def test_hvg_user_defined_batch_key_func( small_mem_context: soma.SOMATileDBContext, - batch_key: Union[None, str, list[str]], + batch_key: str | list[str] | None, ) -> None: if batch_key is None: @@ -283,7 +283,7 @@ def batch_key_func(srs: pd.Series[Any]) -> str: else: if isinstance(batch_key, str): - keys = set([batch_key]) + keys = set([batch_key]) # noqa: C405 else: keys = set(batch_key) diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_online.py b/api/python/cellxgene_census/tests/experimental/pp/test_online.py index e1493e0ae..5a4a8c07a 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_online.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_online.py @@ -4,7 +4,11 @@ import pytest from scipy import sparse -from 
cellxgene_census.experimental.pp._online import CountsAccumulator, MeanAccumulator, MeanVarianceAccumulator +from cellxgene_census.experimental.pp._online import ( + CountsAccumulator, + MeanAccumulator, + MeanVarianceAccumulator, +) def allclose(a: npt.NDArray[np.float64], b: npt.NDArray[np.float64]) -> bool: @@ -13,7 +17,14 @@ def allclose(a: npt.NDArray[np.float64], b: npt.NDArray[np.float64]) -> bool: @pytest.fixture def matrix(m: int, n: int) -> sparse.coo_matrix: - m = 100 * sparse.random(m, n, density=0.1, format="coo", dtype=np.float32, random_state=np.random.default_rng()) + m = 100 * sparse.random( + m, + n, + density=0.1, + format="coo", + dtype=np.float32, + random_state=np.random.default_rng(), + ) m.row.flags.writeable = False # type: ignore[attr-defined] m.col.flags.writeable = False # type: ignore[attr-defined] m.data.flags.writeable = False # type: ignore[attr-defined] @@ -145,7 +156,10 @@ def test_counts(matrix: sparse.coo_matrix, n_batches: int, stride: int) -> None: for batch in range(n_batches): dense = matrix[batches == batch, :].toarray() assert allclose(counts_sum[batch], np.minimum(dense, clip_val[batch]).sum(axis=0)) - assert allclose(counts_squared_sum[batch], (np.minimum(dense, clip_val[batch]) ** 2).sum(axis=0)) + assert allclose( + counts_squared_sum[batch], + (np.minimum(dense, clip_val[batch]) ** 2).sum(axis=0), + ) @pytest.mark.experimental diff --git a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py index c31aa7e81..3c113ea07 100644 --- a/api/python/cellxgene_census/tests/experimental/pp/test_stats.py +++ b/api/python/cellxgene_census/tests/experimental/pp/test_stats.py @@ -31,8 +31,18 @@ def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int [ ("mus_musculus", 'tissue_general == "liver" and is_primary_data == True', ()), ("mus_musculus", 'is_primary_data == True and tissue_general == "heart"', ()), - pytest.param("mus_musculus", "is_primary_data == True", (slice(0, 400_000),), marks=pytest.mark.expensive), - pytest.param("homo_sapiens", "is_primary_data == True", (slice(0, 400_000),), marks=pytest.mark.expensive), + pytest.param( + "mus_musculus", + "is_primary_data == True", + (slice(0, 400_000),), + marks=pytest.mark.expensive, + ), + pytest.param( + "homo_sapiens", + "is_primary_data == True", + (slice(0, 400_000),), + marks=pytest.mark.expensive, + ), ], ) def test_mean_variance( @@ -46,10 +56,14 @@ def test_mean_variance( ) -> None: with cellxgene_census.open_soma(census_version="latest", context=small_mem_context) as census: with census["census_data"][experiment_name].axis_query( - measurement_name="RNA", obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords) + measurement_name="RNA", + obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), ) as query: mean_variance = pp.mean_variance( - query, calculate_mean=calc_mean, calculate_variance=calc_variance, axis=axis + query, + calculate_mean=calc_mean, + calculate_variance=calc_variance, + axis=axis, ) assert isinstance(mean_variance, pd.DataFrame) if calc_mean: @@ -114,7 +128,12 @@ def test_mean_variance_nnz_only( measurement_name="RNA", obs_query=soma.AxisQuery(coords=obs_coords) ) as query: mean_variance = pp.mean_variance( - query, calculate_mean=calc_mean, calculate_variance=calc_variance, axis=axis, nnz_only=True, ddof=0 + query, + calculate_mean=calc_mean, + calculate_variance=calc_variance, + axis=axis, + nnz_only=True, + ddof=0, ) table = 
query.X("raw").tables().concat() @@ -152,7 +171,8 @@ def test_mean_variance_no_flags() -> None: def test_mean_variance_empty_query(experiment_name: str, small_mem_context: soma.SOMATileDBContext) -> None: with cellxgene_census.open_soma(census_version="latest", context=small_mem_context) as census: with census["census_data"][experiment_name].axis_query( - measurement_name="RNA", obs_query=soma.AxisQuery(value_filter='tissue_general == "foo"') + measurement_name="RNA", + obs_query=soma.AxisQuery(value_filter='tissue_general == "foo"'), ) as query: with pytest.raises(ValueError): pp.mean_variance(query, calculate_mean=True, calculate_variance=True) diff --git a/api/python/cellxgene_census/tests/test_directory.py b/api/python/cellxgene_census/tests/test_directory.py index 4149f688e..9ac52f6ea 100644 --- a/api/python/cellxgene_census/tests/test_directory.py +++ b/api/python/cellxgene_census/tests/test_directory.py @@ -5,7 +5,10 @@ import s3fs import cellxgene_census -from cellxgene_census._release_directory import CELL_CENSUS_MIRRORS_DIRECTORY_URL, CELL_CENSUS_RELEASE_DIRECTORY_URL +from cellxgene_census._release_directory import ( + CELL_CENSUS_MIRRORS_DIRECTORY_URL, + CELL_CENSUS_RELEASE_DIRECTORY_URL, +) # This test fixture contains 3 releases: 1 "latest" and 2 "LTS". Of the "LTS" releases, one is aliased to "stable" # and one is "retracted", and both are aliased with "V#" aliases. The ordering of the releases is @@ -41,7 +44,10 @@ "uri": "s3://cellxgene-data-public/cell-census/2022-09-01/soma/", "s3_region": "us-west-2", }, - "h5ads": {"uri": "s3://cellxgene-data-public/cell-census/2022-09-01/h5ads/", "s3_region": "us-west-2"}, + "h5ads": { + "uri": "s3://cellxgene-data-public/cell-census/2022-09-01/h5ads/", + "s3_region": "us-west-2", + }, }, # Ordered the latest release to be last, to verify it is explicitly sorted "2022-11-01": { @@ -69,7 +75,11 @@ MIRRORS_JSON = { "default": "AWS-S3-us-west-2", - "AWS-S3-us-west-2": {"provider": "S3", "base_uri": "s3://cellxgene-data-public/", "region": "us-west-2"}, + "AWS-S3-us-west-2": { + "provider": "S3", + "base_uri": "s3://cellxgene-data-public/", + "region": "us-west-2", + }, } diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index d0f28143e..9c079cbb7 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -21,7 +21,13 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None: obs_value_filter="tissue_general == 'vasculature'", var_value_filter="feature_name in ['Gm53058', '0610010K14Rik']", column_names={ - "obs": ["soma_joinid", "cell_type", "tissue", "tissue_general", "assay"], + "obs": [ + "soma_joinid", + "cell_type", + "tissue", + "tissue_general", + "assay", + ], "var": ["soma_joinid", "feature_id", "feature_name", "feature_length"], }, ) @@ -37,7 +43,10 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None: def test_get_anndata_coords(census: soma.Collection) -> None: with census: ad = cellxgene_census.get_anndata( - census, organism="Mus musculus", obs_coords=slice(1000), var_coords=slice(2000) + census, + organism="Mus musculus", + obs_coords=slice(1000), + var_coords=slice(2000), ) assert ad is not None diff --git a/api/python/cellxgene_census/tests/test_lts_compat.py b/api/python/cellxgene_census/tests/test_lts_compat.py index acefb01a0..dbe646cdd 100644 --- a/api/python/cellxgene_census/tests/test_lts_compat.py +++ 
b/api/python/cellxgene_census/tests/test_lts_compat.py @@ -9,7 +9,7 @@ from __future__ import annotations from collections import deque -from typing import Iterator, Literal, Optional, Sequence, Union, get_args +from typing import Iterator, Literal, Sequence, Union, get_args import pyarrow as pa import pytest @@ -18,17 +18,27 @@ import cellxgene_census SOMATypeNames = Literal[ - "SOMACollection", "SOMAExperiment", "SOMAMeasurement", "SOMADataFrame", "SOMASparseNDArray", "SOMADenseNDArray" + "SOMACollection", + "SOMAExperiment", + "SOMAMeasurement", + "SOMADataFrame", + "SOMASparseNDArray", + "SOMADenseNDArray", ] CollectionTypeNames = ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"] SOMATypes = Union[ - soma.Collection, soma.DataFrame, soma.SparseNDArray, soma.DenseNDArray, soma.Experiment, soma.Measurement + soma.Collection, + soma.DataFrame, + soma.SparseNDArray, + soma.DenseNDArray, + soma.Experiment, + soma.Measurement, ] def walk_census( - census: soma.Collection, filter_types: Optional[Sequence[SOMATypeNames]] = None + census: soma.Collection, filter_types: Sequence[SOMATypeNames] | None = None ) -> Iterator[tuple[str, SOMATypes]]: assert census.soma_type == "SOMACollection" filter_types = filter_types or get_args(SOMATypeNames) diff --git a/api/python/cellxgene_census/tests/test_open.py b/api/python/cellxgene_census/tests/test_open.py index 54622ddcf..391aa4aca 100644 --- a/api/python/cellxgene_census/tests/test_open.py +++ b/api/python/cellxgene_census/tests/test_open.py @@ -67,7 +67,9 @@ def test_open_soma_with_customized_tiledb_config(latest_locator: CensusLocator) @pytest.mark.live_corpus -def test_open_soma_with_customized_plain_soma_context(latest_locator: CensusLocator) -> None: +def test_open_soma_with_customized_plain_soma_context( + latest_locator: CensusLocator, +) -> None: soma_init_buffer_bytes = "221000" timestamp_ms = int(time.time() * 1000) - 10 # don't use exactly current time, as that is the default cfg = { @@ -88,7 +90,9 @@ def test_open_soma_with_customized_plain_soma_context(latest_locator: CensusLoca @pytest.mark.live_corpus -def test_open_soma_with_customized_default_soma_context(latest_locator: CensusLocator) -> None: +def test_open_soma_with_customized_default_soma_context( + latest_locator: CensusLocator, +) -> None: soma_init_buffer_bytes = "221000" timestamp_ms = int(time.time() * 1000) - 10 # don't use exactly current time, as that is the default @@ -113,21 +117,31 @@ def test_open_soma_uri_with_custom_s3_region() -> None: with patch("cellxgene_census._open.soma.open") as m: cellxgene_census.open_soma( - uri="s3://bucket/cell-census/2022-11-01/soma/", tiledb_config={"vfs.s3.region": "region-1"} + uri="s3://bucket/cell-census/2022-11-01/soma/", + tiledb_config={"vfs.s3.region": "region-1"}, ) m.assert_called_once_with( - "s3://bucket/cell-census/2022-11-01/soma/", mode="r", soma_type=soma.Collection, context=ANY + "s3://bucket/cell-census/2022-11-01/soma/", + mode="r", + soma_type=soma.Collection, + context=ANY, ) assert m.call_args[1]["context"].tiledb_config["vfs.s3.region"] == "region-1" -def test_open_soma_census_version_always_uses_mirror_s3_region(requests_mock: rm.Mocker) -> None: +def test_open_soma_census_version_always_uses_mirror_s3_region( + requests_mock: rm.Mocker, +) -> None: assert get_default_soma_context().tiledb_config["vfs.s3.region"] != "mirror-region-1", "test pre-condition" mock_mirrors = { "default": "test-mirror", - "test-mirror": {"provider": "S3", "base_uri": "s3://mirror-bucket/", "region": "mirror-region-1"}, + 
"test-mirror": { + "provider": "S3", + "base_uri": "s3://mirror-bucket/", + "region": "mirror-region-1", + }, } requests_mock.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL, json=mock_mirrors) @@ -147,7 +161,10 @@ def test_open_soma_census_version_always_uses_mirror_s3_region(requests_mock: rm cellxgene_census.open_soma(census_version="latest") m.assert_called_once_with( - "s3://mirror-bucket/cell-census/2022-11-01/soma/", mode="r", soma_type=soma.Collection, context=ANY + "s3://mirror-bucket/cell-census/2022-11-01/soma/", + mode="r", + soma_type=soma.Collection, + context=ANY, ) assert m.call_args[1]["context"].tiledb_config["vfs.s3.region"] == "mirror-region-1" @@ -156,7 +173,10 @@ def test_open_soma_census_version_always_uses_mirror_s3_region(requests_mock: rm cellxgene_census.open_soma(census_version="latest", tiledb_config={"vfs.s3.region": "region-2"}) m.assert_called_once_with( - "s3://mirror-bucket/cell-census/2022-11-01/soma/", mode="r", soma_type=soma.Collection, context=ANY + "s3://mirror-bucket/cell-census/2022-11-01/soma/", + mode="r", + soma_type=soma.Collection, + context=ANY, ) assert m.call_args[1]["context"].tiledb_config["vfs.s3.region"] == "mirror-region-1" @@ -190,8 +210,16 @@ def test_open_soma_errors(requests_mock: rm.Mocker) -> None: def test_open_soma_uses_correct_mirror(requests_mock: rm.Mocker) -> None: mock_mirrors = { "default": "test-mirror", - "test-mirror": {"provider": "S3", "base_uri": "s3://mirror-bucket-1/", "region": "region-1"}, - "test-mirror-2": {"provider": "S3", "base_uri": "s3://mirror-bucket-2/", "region": "region-2"}, + "test-mirror": { + "provider": "S3", + "base_uri": "s3://mirror-bucket-1/", + "region": "region-1", + }, + "test-mirror-2": { + "provider": "S3", + "base_uri": "s3://mirror-bucket-2/", + "region": "region-2", + }, } requests_mock.get(CELL_CENSUS_MIRRORS_DIRECTORY_URL, json=mock_mirrors) @@ -219,14 +247,24 @@ def test_open_soma_uses_correct_mirror(requests_mock: rm.Mocker) -> None: with patch("cellxgene_census._open._open_soma") as m: cellxgene_census.open_soma() m.assert_called_once_with( - {"uri": "s3://mirror-bucket-1/cell-census/2022-11-01/soma/", "region": "region-1", "provider": "S3"}, None + { + "uri": "s3://mirror-bucket-1/cell-census/2022-11-01/soma/", + "region": "region-1", + "provider": "S3", + }, + None, ) # Verify that the correct mirror is used if a mirror parameter is specified with patch("cellxgene_census._open._open_soma") as m: cellxgene_census.open_soma(mirror="test-mirror-2") m.assert_called_once_with( - {"uri": "s3://mirror-bucket-2/cell-census/2022-11-01/soma/", "region": "region-2", "provider": "S3"}, None + { + "uri": "s3://mirror-bucket-2/cell-census/2022-11-01/soma/", + "region": "region-2", + "provider": "S3", + }, + None, ) # Verify that an error is raised if a non existing mirror is specified @@ -379,7 +417,10 @@ def test_opening_census_without_anon_access_fails_with_bogus_creds() -> None: os.environ["AWS_ACCESS_KEY_ID"] = "fake_id" os.environ["AWS_SECRET_ACCESS_KEY"] = "fake_key" # Passing an empty context - with pytest.raises(tiledb.TileDBError, match=r"The AWS Access Key Id you provided does not exist in our records"): + with pytest.raises( + tiledb.TileDBError, + match=r"The AWS Access Key Id you provided does not exist in our records", + ): cellxgene_census.open_soma(census_version="latest", context=soma.SOMATileDBContext()) @@ -397,7 +438,10 @@ def test_can_open_with_anonymous_access() -> None: def test_get_default_soma_context_tiledb_config_overrides() -> None: context = get_default_soma_context( - 
tiledb_config={"nondefault.config.option": "true", "vfs.s3.no_sign_request": "false"} + tiledb_config={ + "nondefault.config.option": "true", + "vfs.s3.no_sign_request": "false", + } ) assert context.tiledb_config["nondefault.config.option"] == "true", "adds new option" assert context.tiledb_config["vfs.s3.no_sign_request"] == "false", "overrides existing default" diff --git a/api/python/notebooks/analysis_demo/comp_bio_data_integration_scvi.ipynb b/api/python/notebooks/analysis_demo/comp_bio_data_integration_scvi.ipynb index 00f7301c0..5c325180c 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_data_integration_scvi.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_data_integration_scvi.ipynb @@ -57,8 +57,8 @@ ], "source": [ "import cellxgene_census\n", - "import scanpy as sc\n", "import numpy as np\n", + "import scanpy as sc\n", "import scvi\n", "from scipy.sparse import csr_matrix" ] @@ -230,10 +230,15 @@ }, "outputs": [], "source": [ - "tabula_muris_liver_ids = [\"4546e757-34d0-4d17-be06-538318925fcd\", \"6202a243-b713-4e12-9ced-c387f8483dea\"]\n", + "tabula_muris_liver_ids = [\n", + " \"4546e757-34d0-4d17-be06-538318925fcd\",\n", + " \"6202a243-b713-4e12-9ced-c387f8483dea\",\n", + "]\n", "\n", "adata = cellxgene_census.get_anndata(\n", - " census, organism=\"Mus musculus\", obs_value_filter=f\"dataset_id in {tabula_muris_liver_ids}\"\n", + " census,\n", + " organism=\"Mus musculus\",\n", + " obs_value_filter=f\"dataset_id in {tabula_muris_liver_ids}\",\n", ")" ] }, @@ -496,7 +501,12 @@ "outputs": [], "source": [ "sc.pp.highly_variable_genes(\n", - " adata, n_top_genes=1000, flavor=\"seurat_v3\", layer=\"counts\", batch_key=\"dataset_id\", subset=True\n", + " adata,\n", + " n_top_genes=1000,\n", + " flavor=\"seurat_v3\",\n", + " layer=\"counts\",\n", + " batch_key=\"dataset_id\",\n", + " subset=True,\n", ")" ] }, diff --git a/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb b/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb index bf674ec74..70d781730 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_embedding_exploration.ipynb @@ -66,30 +66,33 @@ "metadata": {}, "outputs": [], "source": [ - "import json\n", - "import cellxgene_census\n", + "import warnings\n", + "from typing import List\n", + "\n", "import anndata\n", + "import cellxgene_census\n", "import numpy as np\n", "import scanpy as sc\n", - "import warnings\n", - "import tiledbsoma as soma\n", - "from scipy import sparse\n", - "from typing import List\n", - "from cellxgene_census.experimental import get_embedding, get_embedding_metadata\n", + "from cellxgene_census.experimental import get_embedding\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "\n", "def generate_umaps_from_embeddings(adata: anndata.AnnData, emb_names: list, metric=\"euclidean\"):\n", - " \"\"\"\n", - " Generate UMAPs from embeddings stored in `adata.obsm`.\n", + " \"\"\"Generate UMAPs from embeddings stored in `adata.obsm`.\n", " `emb_names` is a list that contains keys present in `adata.obsm`.\n", " \"\"\"\n", - "\n", " adata = adata.copy()\n", " for emb_name in emb_names:\n", " print(f\"Generating UMAP for {emb_name}\")\n", - " sc.pp.neighbors(adata, n_neighbors=15, use_rep=emb_name, method=\"umap\", key_added=emb_name, metric=metric)\n", + " sc.pp.neighbors(\n", + " adata,\n", + " n_neighbors=15,\n", + " use_rep=emb_name,\n", + " method=\"umap\",\n", + " key_added=emb_name,\n", + " metric=metric,\n", + " 
)\n", " sc.tl.umap(adata, neighbors_key=emb_name)\n", " X_emb_name = emb_name if emb_name[:2] == \"X_\" else f\"X_{emb_name}\"\n", " if metric != \"euclidean\":\n", @@ -113,8 +116,7 @@ " census_version: str = None,\n", " experiment_name: str = None,\n", "):\n", - " \"\"\"\n", - " For a given set of Census cell coordinates (soma_joinids)\n", + " \"\"\"For a given set of Census cell coordinates (soma_joinids)\n", " fetch embeddings with TileDBSoma and return the corresponding\n", " AnnData with embeddings slotted in.\n", "\n", @@ -124,7 +126,6 @@ "\n", " Assume that all embeddings provided are coming from the same experiment.\n", " \"\"\"\n", - "\n", " with cellxgene_census.open_soma(census_version=census_version) as census:\n", " print(\"Getting anndata with Census embeddings: \", embedding_names)\n", "\n", @@ -305,8 +306,20 @@ } ], "source": [ - "sc.pl.scatter(adata, basis=\"geneformer_umap\", color=[\"OCA2\", \"KIT\", \"cell_type\"], size=10, use_raw=False)\n", - "sc.pl.scatter(adata, basis=\"scgpt_umap\", color=[\"OCA2\", \"KIT\", \"cell_type\"], size=10, use_raw=False)\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"geneformer_umap\",\n", + " color=[\"OCA2\", \"KIT\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"scgpt_umap\",\n", + " color=[\"OCA2\", \"KIT\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", "sc.pl.scatter(adata, basis=\"uce_umap\", color=[\"OCA2\", \"KIT\", \"cell_type\"], size=10, use_raw=False)\n", "sc.pl.scatter(adata, basis=\"scvi_umap\", color=[\"OCA2\", \"KIT\", \"cell_type\"], size=10, use_raw=False)" ] @@ -359,7 +372,13 @@ } ], "source": [ - "sc.pl.scatter(adata, basis=\"geneformer_umap\", color=[\"dataset_id\", \"assay\"], size=10, use_raw=False)\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"geneformer_umap\",\n", + " color=[\"dataset_id\", \"assay\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", "sc.pl.scatter(adata, basis=\"scgpt_umap\", color=[\"dataset_id\", \"assay\"], size=10, use_raw=False)\n", "sc.pl.scatter(adata, basis=\"uce_umap\", color=[\"dataset_id\", \"assay\"], size=10, use_raw=False)\n", "sc.pl.scatter(adata, basis=\"scvi_umap\", color=[\"dataset_id\", \"assay\"], size=10, use_raw=False)" @@ -543,10 +562,12 @@ "for i, embedding_i in enumerate(embedding_keys):\n", " for j, embedding_j in enumerate(embedding_keys):\n", " sim_scores_leiden[i, j] = normalized_mutual_info_score(\n", - " adata_rbn.obs[f\"{embedding_i}_leiden\"], adata_rbn.obs[f\"{embedding_j}_leiden\"]\n", + " adata_rbn.obs[f\"{embedding_i}_leiden\"],\n", + " adata_rbn.obs[f\"{embedding_j}_leiden\"],\n", " )\n", " sim_scores_hdbscan[i, j] = normalized_mutual_info_score(\n", - " adata_rbn.obs[f\"{embedding_i}_hdbscan\"], adata_rbn.obs[f\"{embedding_j}_hdbscan\"]\n", + " adata_rbn.obs[f\"{embedding_i}_hdbscan\"],\n", + " adata_rbn.obs[f\"{embedding_j}_hdbscan\"],\n", " )\n", "\n", "sim_scores_leiden_table = pd.DataFrame(data=sim_scores_leiden, index=embedding_keys, columns=embedding_keys)\n", @@ -678,7 +699,13 @@ } ], "source": [ - "sc.pl.scatter(adata, basis=\"geneformer_umap\", color=[\"TH\", \"assay\", \"disease\"], size=10, use_raw=False)\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"geneformer_umap\",\n", + " color=[\"TH\", \"assay\", \"disease\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", "sc.pl.scatter(adata, basis=\"scgpt_umap\", color=[\"TH\", \"assay\", \"disease\"], size=10, use_raw=False)\n", "sc.pl.scatter(adata, basis=\"uce_umap\", 
color=[\"TH\", \"assay\", \"disease\"], size=10, use_raw=False)\n", "sc.pl.scatter(adata, basis=\"scvi_umap\", color=[\"TH\", \"assay\", \"disease\"], size=10, use_raw=False)" @@ -789,10 +816,34 @@ } ], "source": [ - "sc.pl.scatter(adata, basis=\"geneformer_umap\", color=[\"CFTR\", \"assay\", \"cell_type\"], size=10, use_raw=False)\n", - "sc.pl.scatter(adata, basis=\"scgpt_umap\", color=[\"CFTR\", \"assay\", \"cell_type\"], size=10, use_raw=False)\n", - "sc.pl.scatter(adata, basis=\"uce_umap\", color=[\"CFTR\", \"assay\", \"cell_type\"], size=10, use_raw=False)\n", - "sc.pl.scatter(adata, basis=\"scvi_umap\", color=[\"CFTR\", \"assay\", \"cell_type\"], size=10, use_raw=False)" + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"geneformer_umap\",\n", + " color=[\"CFTR\", \"assay\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"scgpt_umap\",\n", + " color=[\"CFTR\", \"assay\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"uce_umap\",\n", + " color=[\"CFTR\", \"assay\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")\n", + "sc.pl.scatter(\n", + " adata,\n", + " basis=\"scvi_umap\",\n", + " color=[\"CFTR\", \"assay\", \"cell_type\"],\n", + " size=10,\n", + " use_raw=False,\n", + ")" ] }, { diff --git a/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb b/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb index 6d16d8d9c..b2d8cfdb2 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_explore_and_load_lung_data.ipynb @@ -47,10 +47,9 @@ ], "source": [ "import cellxgene_census\n", - "import scanpy as sc\n", - "import pandas as pd\n", "import numpy as np\n", - "from scipy.sparse import coo_matrix\n", + "import pandas as pd\n", + "import scanpy as sc\n", "\n", "census = cellxgene_census.open_soma()" ] diff --git a/api/python/notebooks/analysis_demo/comp_bio_geneformer_prediction.ipynb b/api/python/notebooks/analysis_demo/comp_bio_geneformer_prediction.ipynb index e81c5d6a7..51ceecf2f 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_geneformer_prediction.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_geneformer_prediction.ipynb @@ -84,9 +84,10 @@ "metadata": {}, "outputs": [], "source": [ - "import cellxgene_census\n", "import json\n", "\n", + "import cellxgene_census\n", + "\n", "census = cellxgene_census.open_soma(census_version=\"2023-12-15\")\n", "\n", "geneformer_info = census[\"census_data\"][\"homo_sapiens\"].ms[\"RNA\"].obsm[\"geneformer\"].metadata\n", @@ -159,20 +160,19 @@ "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", - "from transformers import BertForSequenceClassification\n", - "from transformers import Trainer\n", - "from geneformer import DataCollatorForCellClassification\n", - "from geneformer import TranscriptomeTokenizer\n", - "from geneformer import EmbExtractor\n", - "from cellxgene_census.experimental import get_embedding\n", - "from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer\n", - "import datasets\n", "import json\n", "import os\n", - "import scanpy as sc\n", - "import numpy as np\n", + "\n", "import cellxgene_census\n", - "import tiledbsoma" + "import datasets\n", + "import numpy as np\n", + "import scanpy as sc\n", + "from geneformer import (\n", + " DataCollatorForCellClassification,\n", + " EmbExtractor,\n", + " 
TranscriptomeTokenizer,\n", + ")\n", + "from transformers import BertForSequenceClassification, Trainer" ] }, { @@ -300,7 +300,7 @@ "model_dir = \"./fine_tuned_geneformer/\"\n", "label_mapping_dict_file = os.path.join(model_dir, \"label_to_cell_subclass.json\")\n", "\n", - "with open(label_mapping_dict_file, \"r\") as fp:\n", + "with open(label_mapping_dict_file) as fp:\n", " label_mapping_dict = json.load(fp)" ] }, @@ -847,7 +847,10 @@ "# 1. https://cellxgene.cziscience.com/collections/c697eaaf-a3be-4251-b036-5f9052179e70\n", "# 2. https://cellxgene.cziscience.com/collections/f2a488bf-782f-4c20-a8e5-cb34d48c1f7e\n", "\n", - "dataset_ids = [\"fa8605cf-f27e-44af-ac2a-476bee4410d3\", \"3c75a463-6a87-4132-83a8-c3002624394d\"]\n", + "dataset_ids = [\n", + " \"fa8605cf-f27e-44af-ac2a-476bee4410d3\",\n", + " \"3c75a463-6a87-4132-83a8-c3002624394d\",\n", + "]\n", "\n", "adata_census = cellxgene_census.get_anndata(\n", " census=census,\n", diff --git a/api/python/notebooks/analysis_demo/comp_bio_normalizing_full_gene_sequencing.ipynb b/api/python/notebooks/analysis_demo/comp_bio_normalizing_full_gene_sequencing.ipynb index 1282165ad..c1951d54d 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_normalizing_full_gene_sequencing.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_normalizing_full_gene_sequencing.ipynb @@ -46,7 +46,6 @@ "source": [ "import cellxgene_census\n", "import scanpy as sc\n", - "import numpy as np\n", "from scipy.sparse import csr_matrix\n", "\n", "census = cellxgene_census.open_soma()" @@ -191,7 +190,9 @@ "source": [ "liver_dataset_id = \"4546e757-34d0-4d17-be06-538318925fcd\"\n", "liver_adata = cellxgene_census.get_anndata(\n", - " census, organism=\"Mus musculus\", obs_value_filter=f\"dataset_id=='{liver_dataset_id}'\"\n", + " census,\n", + " organism=\"Mus musculus\",\n", + " obs_value_filter=f\"dataset_id=='{liver_dataset_id}'\",\n", ")" ] }, diff --git a/api/python/notebooks/analysis_demo/comp_bio_scvi_model_use.ipynb b/api/python/notebooks/analysis_demo/comp_bio_scvi_model_use.ipynb index f86e1f315..bf71bc202 100644 --- a/api/python/notebooks/analysis_demo/comp_bio_scvi_model_use.ipynb +++ b/api/python/notebooks/analysis_demo/comp_bio_scvi_model_use.ipynb @@ -77,9 +77,10 @@ "metadata": {}, "outputs": [], "source": [ - "import cellxgene_census\n", "import json\n", "\n", + "import cellxgene_census\n", + "\n", "census = cellxgene_census.open_soma(census_version=\"2023-12-15\")\n", "\n", "scvi_info = census[\"census_data\"][\"homo_sapiens\"].ms[\"RNA\"].obsm[\"scvi\"].metadata\n", @@ -145,17 +146,11 @@ "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", - "import functools\n", - "import gc\n", - "import scanpy as sc\n", - "import os\n", - "import cellxgene_census\n", "import anndata\n", + "import cellxgene_census\n", "import numpy as np\n", - "import pandas as pd\n", + "import scanpy as sc\n", "import scvi\n", - "import tiledbsoma as soma\n", - "import yaml\n", "from sklearn.ensemble import RandomForestClassifier" ] }, @@ -455,7 +450,10 @@ "# Some PBMC data from these collections\n", "# 1. https://cellxgene.cziscience.com/collections/c697eaaf-a3be-4251-b036-5f9052179e70\n", "# 2. 
https://cellxgene.cziscience.com/collections/f2a488bf-782f-4c20-a8e5-cb34d48c1f7e\n", - "dataset_ids = [\"fa8605cf-f27e-44af-ac2a-476bee4410d3\", \"3c75a463-6a87-4132-83a8-c3002624394d\"]\n", + "dataset_ids = [\n", + " \"fa8605cf-f27e-44af-ac2a-476bee4410d3\",\n", + " \"3c75a463-6a87-4132-83a8-c3002624394d\",\n", + "]\n", "\n", "adata_census = cellxgene_census.get_anndata(\n", " census=census,\n", diff --git a/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb b/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb index 6cbda8e11..22909071f 100644 --- a/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb +++ b/api/python/notebooks/api_demo/census_access_maintained_embeddings.ipynb @@ -164,9 +164,10 @@ "metadata": {}, "outputs": [], "source": [ + "import warnings\n", + "\n", "import cellxgene_census\n", "import scanpy\n", - "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", @@ -269,8 +270,8 @@ "outputs": [], "source": [ "import cellxgene_census\n", - "import tiledbsoma as soma\n", "import scanpy\n", + "import tiledbsoma as soma\n", "\n", "census_version = \"2023-12-15\"\n", "\n", @@ -278,7 +279,8 @@ "\n", "experiment = census[\"census_data\"][\"homo_sapiens\"]\n", "query = experiment.axis_query(\n", - " measurement_name=\"RNA\", obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\")\n", + " measurement_name=\"RNA\",\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\"),\n", ")" ] }, @@ -397,7 +399,8 @@ "\n", "experiment = census[\"census_data\"][\"homo_sapiens\"]\n", "query = experiment.axis_query(\n", - " measurement_name=\"RNA\", obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\")\n", + " measurement_name=\"RNA\",\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\"),\n", ")" ] }, diff --git a/api/python/notebooks/api_demo/census_compute_over_X.ipynb b/api/python/notebooks/api_demo/census_compute_over_X.ipynb index dbf6b39d7..7ea88f4e1 100644 --- a/api/python/notebooks/api_demo/census_compute_over_X.ipynb +++ b/api/python/notebooks/api_demo/census_compute_over_X.ipynb @@ -32,10 +32,9 @@ }, "outputs": [], "source": [ + "import cellxgene_census\n", "import numpy as np\n", "import pandas as pd\n", - "\n", - "import cellxgene_census\n", "import tiledbsoma as soma\n", "from tiledbsoma.experiment_query import X_as_series" ] @@ -485,8 +484,7 @@ " n_variables: int\n", "\n", " def __init__(self, n_samples: int, n_variables: int):\n", - " \"\"\"\n", - " Compute mean and variance for n_variables over n_samples, encoded\n", + " \"\"\"Compute mean and variance for n_variables over n_samples, encoded\n", " in a COO format. 
Equivalent to:\n", " numpy.mean(data, axis=0)\n", " numpy.var(data, axix=0)\n", @@ -503,9 +501,7 @@ " _mean_variance_update(coord_vec, value_vec, self.n_a, self.u_a, self.M2_a)\n", "\n", " def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:\n", - " \"\"\"\n", - " Returns tuple containing mean and variance\n", - " \"\"\"\n", + " \"\"\"Returns tuple containing mean and variance\"\"\"\n", " u, M2 = _mean_variance_finalize(self.n_samples, self.n_a, self.u_a, self.M2_a)\n", "\n", " # compute sample variance\n", @@ -522,8 +518,7 @@ " u: npt.NDArray[np.float64],\n", " M2: npt.NDArray[np.float64],\n", "):\n", - " \"\"\"\n", - " Incrementally accumulate mean and sum of square of distance from mean using\n", + " \"\"\"Incrementally accumulate mean and sum of square of distance from mean using\n", " Welford's online method.\n", " \"\"\"\n", " for col, val in zip(col_arr, val_arr):\n", @@ -536,10 +531,12 @@ "\n", "@numba.jit(nopython=True)\n", "def _mean_variance_finalize(\n", - " n_samples: int, n_a: npt.NDArray[np.int32], u_a: npt.NDArray[np.float64], M2_a: npt.NDArray[np.float64]\n", + " n_samples: int,\n", + " n_a: npt.NDArray[np.int32],\n", + " u_a: npt.NDArray[np.float64],\n", + " M2_a: npt.NDArray[np.float64],\n", "):\n", - " \"\"\"\n", - " Finalize incremental values, acconting for missing elements (due to sparse input).\n", + " \"\"\"Finalize incremental values, acconting for missing elements (due to sparse input).\n", " Non-sparse and sparse combined using Chan's parallel adaptation of Welford's.\n", " The code assumes the sparse elements are all zero and ignores those terms.\n", " \"\"\"\n", @@ -783,7 +780,8 @@ " n_cells_by_dataset = pd.Series(\n", " 0,\n", " index=pd.MultiIndex.from_product(\n", - " (var_df.index, obs_df.dataset_id.unique()), names=[\"soma_joinid\", \"dataset_id\"]\n", + " (var_df.index, obs_df.dataset_id.unique()),\n", + " names=[\"soma_joinid\", \"dataset_id\"],\n", " ),\n", " dtype=np.int64,\n", " name=\"n_cells\",\n", @@ -800,7 +798,9 @@ " .value_counts()\n", " )\n", " np.add.at(\n", - " n_cells_by_dataset, n_cells_by_dataset.index.get_indexer(value_counts.index), value_counts.to_numpy()\n", + " n_cells_by_dataset,\n", + " n_cells_by_dataset.index.get_indexer(value_counts.index),\n", + " value_counts.to_numpy(),\n", " )\n", "\n", " # drop any combinations that are not observed\n", diff --git a/api/python/notebooks/api_demo/census_dataset_presence.ipynb b/api/python/notebooks/api_demo/census_dataset_presence.ipynb index 5ba517b89..9dcc5fe22 100644 --- a/api/python/notebooks/api_demo/census_dataset_presence.ipynb +++ b/api/python/notebooks/api_demo/census_dataset_presence.ipynb @@ -49,8 +49,6 @@ } ], "source": [ - "import numpy as np\n", - "from scipy import sparse\n", "import cellxgene_census\n", "\n", "census = cellxgene_census.open_soma()" diff --git a/api/python/notebooks/api_demo/census_embedding.ipynb b/api/python/notebooks/api_demo/census_embedding.ipynb index ddc48fc7a..3f8b04190 100644 --- a/api/python/notebooks/api_demo/census_embedding.ipynb +++ b/api/python/notebooks/api_demo/census_embedding.ipynb @@ -163,13 +163,12 @@ "source": [ "# A few imports and utility functions used throughout this notebook\n", "\n", - "import json\n", + "\n", + "import warnings\n", "\n", "import cellxgene_census\n", - "import scanpy\n", "import numpy as np\n", - "import warnings\n", - "import scipy.sparse as sp\n", + "import scanpy\n", "import tiledbsoma as soma\n", "from cellxgene_census.experimental import get_embedding, get_embedding_metadata\n", "\n", 
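
The census_compute_over_X.ipynb hunks above rely on two identities: Welford's single-pass update of the (n, mean, M2) triple, and Chan's O(1) rule for merging two partial accumulators, which is the same combine step `_mbomv_combine_batches` applies across batches in `_online.py`. A minimal, self-contained sanity check of both (a sketch, not part of this patch; `welford` and `chan_combine` are hypothetical names):

    import numpy as np

    def welford(x):
        # Single-pass accumulation of count, mean, and M2, where M2 is the
        # running sum of squared deviations from the current mean
        # (mirrors _mean_variance_update above).
        n, u, M2 = 0, 0.0, 0.0
        for v in x:
            n += 1
            d = v - u
            u += d / n
            M2 += d * (v - u)
        return n, u, M2

    def chan_combine(a, b):
        # Chan's parallel rule: merge two (n, mean, M2) accumulators in O(1)
        # (mirrors the batch-combine step in _mbomv_combine_batches).
        (n_a, u_a, M2_a), (n_b, u_b, M2_b) = a, b
        n = n_a + n_b
        delta = u_b - u_a
        u = u_a + delta * n_b / n
        M2 = M2_a + M2_b + delta**2 * n_a * n_b / n
        return n, u, M2

    x = np.random.default_rng(0).normal(size=1_000)
    n, u, M2 = chan_combine(welford(x[:400]), welford(x[400:]))
    assert np.isclose(u, x.mean())
    assert np.isclose(M2 / (n - 1), x.var(ddof=1))  # sample variance, ddof=1

Because M2 accumulates the sum of squared deviations, dividing by n - 1 recovers the sample variance regardless of how the stream was partitioned.
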
@@ -288,7 +287,8 @@ "\n", "experiment = census[\"census_data\"][EXPERIMENT_NAME]\n", "query = experiment.axis_query(\n", - " measurement_name=\"RNA\", obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\")\n", + " measurement_name=\"RNA\",\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'central nervous system'\"),\n", ")" ] }, @@ -584,8 +584,6 @@ "# Load a portion of the embedding (caution: embeddings can be quite large)\n", "\n", "import cellxgene_census\n", - "import numpy as np\n", - "import scipy.sparse as sp\n", "import tiledbsoma as soma\n", "\n", "# Fetch first 500_000 joinids from the embedding.\n", @@ -612,7 +610,8 @@ "with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:\n", " experiment = census[\"census_data\"][EXPERIMENT_NAME]\n", " with experiment.axis_query(\n", - " measurement_name=MEASUREMENT_NAME, obs_query=soma.AxisQuery(coords=(embedding_joinids,))\n", + " measurement_name=MEASUREMENT_NAME,\n", + " obs_query=soma.AxisQuery(coords=(embedding_joinids,)),\n", " ) as query:\n", " obs_df = query.obs(column_names=[\"soma_joinid\", \"suspension_type\"]).concat().to_pandas()\n", "\n", @@ -671,8 +670,6 @@ } ], "source": [ - "from cellxgene_census.experimental import get_embedding_metadata\n", - "\n", "embedding_metadata = get_embedding_metadata(EMBEDDING_URI)\n", "\n", "embedding_metadata" diff --git a/api/python/notebooks/experimental/highly_variable_genes.ipynb b/api/python/notebooks/experimental/highly_variable_genes.ipynb index ddb4edf81..f5a69c031 100644 --- a/api/python/notebooks/experimental/highly_variable_genes.ipynb +++ b/api/python/notebooks/experimental/highly_variable_genes.ipynb @@ -42,9 +42,12 @@ "source": [ "# Import packages\n", "import cellxgene_census\n", - "from cellxgene_census.experimental.pp import get_highly_variable_genes, highly_variable_genes\n", "import pandas as pd\n", - "import tiledbsoma as soma" + "import tiledbsoma as soma\n", + "from cellxgene_census.experimental.pp import (\n", + " get_highly_variable_genes,\n", + " highly_variable_genes,\n", + ")" ] }, { diff --git a/api/python/notebooks/experimental/mean_variance.ipynb b/api/python/notebooks/experimental/mean_variance.ipynb index 4513f7a01..fefa18be2 100644 --- a/api/python/notebooks/experimental/mean_variance.ipynb +++ b/api/python/notebooks/experimental/mean_variance.ipynb @@ -44,9 +44,9 @@ "source": [ "# Import packages\n", "import cellxgene_census\n", - "from cellxgene_census.experimental.pp import mean_variance\n", + "import pandas as pd\n", "import tiledbsoma as soma\n", - "import pandas as pd" + "from cellxgene_census.experimental.pp import mean_variance" ] }, { diff --git a/api/python/notebooks/experimental/pca.ipynb b/api/python/notebooks/experimental/pca.ipynb index 6a805f63c..56eb8ff04 100644 --- a/api/python/notebooks/experimental/pca.ipynb +++ b/api/python/notebooks/experimental/pca.ipynb @@ -31,7 +31,6 @@ "from cellxgene_census.experimental.pp import highly_variable_genes\n", "from sklearn.decomposition import IncrementalPCA\n", "\n", - "\n", "\"\"\"\n", "Configuration - the dataset and computational parameters.\n", "\"\"\"\n", diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb index f2d8dd0d5..2ed46d472 100644 --- a/api/python/notebooks/experimental/pytorch.ipynb +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -38,8 +38,8 @@ "id": "c3dd549f", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:19.390343Z", - "end_time": 
"2023-10-09T18:20:21.600206Z" + "end_time": "2023-10-09T18:20:21.600206Z", + "start_time": "2023-10-09T18:20:19.390343Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:00.392773Z", @@ -80,8 +80,8 @@ "id": "54896e6f", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:21.830683Z", - "end_time": "2023-10-09T18:20:22.278894Z" + "end_time": "2023-10-09T18:20:22.278894Z", + "start_time": "2023-10-09T18:20:21.830683Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:02.884588Z", @@ -118,17 +118,20 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "### `ExperimentDataPipe` class explained\n", "\n", "This class provides an implementation of PyTorch's [DataPipe interface](https://pytorch.org/data/main/torchdata.datapipes.iter.html), which defines a common mechanism for wrapping and accessing training data from any underlying source. The `ExperimentDataPipe` class encapsulates the details of querying and retrieving Census data from a single SOMA `Experiment` and returning it to the caller as PyTorch Tensors. Most importantly, it retrieves the data lazily from the Census in batches, avoiding having to load the entire training dataset into memory at once. (Note: PyTorch also provides `DataSet` as a legacy interface for wrapping and accessing training data sources, but a `DataPipe` can be used interchangeably.)\n" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "### `ExperimentDataPipe` parameters explained\n", "\n", @@ -146,10 +149,7 @@ "\n", "The `soma_chunk_size` sets the number of rows of data that are retrieved from the Census and held in memory at a given time. This controls\n", " the maximum memory usage of the `ExperimentDataPipe`. Smaller values will require less memory but will also result in lower read performance. If you are running out of memory when training a model, try reducing this value. The default is set to retrieve ~1GB of data per chunk, which takes into account how many `var` (gene) columns are being requested. This parameter also affects the granularity of the \"global\" shuffling step when `shuffle=True` (see ``shuffle`` parameter API docs for details)." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "attachments": {}, @@ -166,8 +166,8 @@ "id": "70a2ddbe", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:22.821101Z", - "end_time": "2023-10-09T18:20:32.049618Z" + "end_time": "2023-10-09T18:20:32.049618Z", + "start_time": "2023-10-09T18:20:22.821101Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:03.359927Z", @@ -179,7 +179,9 @@ "outputs": [ { "data": { - "text/plain": "(15020, 60664)" + "text/plain": [ + "(15020, 60664)" + ] }, "execution_count": 27, "metadata": {}, @@ -207,8 +209,8 @@ "id": "133f594f", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:32.051289Z", - "end_time": "2023-10-09T18:20:32.052795Z" + "end_time": "2023-10-09T18:20:32.052795Z", + "start_time": "2023-10-09T18:20:32.051289Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:05.527106Z", @@ -239,8 +241,8 @@ "id": "39d30df2", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:32.052898Z", - "end_time": "2023-10-09T18:20:32.056886Z" + "end_time": "2023-10-09T18:20:32.056886Z", + "start_time": "2023-10-09T18:20:32.052898Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:05.538514Z", @@ -280,8 +282,8 @@ "id": "6b792b4b", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:32.058875Z", - "end_time": "2023-10-09T18:20:32.060262Z" + "end_time": "2023-10-09T18:20:32.060262Z", + "start_time": "2023-10-09T18:20:32.058875Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:05.543534Z", @@ -297,7 +299,7 @@ "\n", "class LogisticRegression(torch.nn.Module):\n", " def __init__(self, input_dim, output_dim):\n", - " super(LogisticRegression, self).__init__()\n", + " super(LogisticRegression, self).__init__() # noqa: UP008\n", " self.linear = torch.nn.Linear(input_dim, output_dim)\n", "\n", " def forward(self, x):\n", @@ -320,8 +322,8 @@ "id": "b744cd21", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:32.307661Z", - "end_time": "2023-10-09T18:20:32.310829Z" + "end_time": "2023-10-09T18:20:32.310829Z", + "start_time": "2023-10-09T18:20:32.307661Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:05.549312Z", @@ -431,8 +433,8 @@ "id": "733ec2fb", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:20:32.311816Z", - "end_time": "2023-10-09T18:29:31.028253Z" + "end_time": "2023-10-09T18:29:31.028253Z", + "start_time": "2023-10-09T18:20:32.311816Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:33:05.564772Z", @@ -495,8 +497,8 @@ "id": "d3e33edc", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:29:31.705548Z", - "end_time": "2023-10-09T18:29:59.425527Z" + "end_time": "2023-10-09T18:29:59.425527Z", + "start_time": "2023-10-09T18:29:31.705548Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:34:04.804402Z", @@ -526,8 +528,8 @@ "id": "00e12182", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:29:59.429107Z", - "end_time": "2023-10-09T18:29:59.438079Z" + "end_time": "2023-10-09T18:29:59.438079Z", + "start_time": "2023-10-09T18:29:59.429107Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:34:09.334916Z", @@ -539,7 +541,16 @@ "outputs": [ { "data": { - "text/plain": "tensor([ 1, 11, 1, 1, 5, 1, 1, 1, 1, 5, 1, 5, 1, 5, 5, 8, 1, 1,\n 7, 1, 5, 5, 8, 5, 5, 1, 1, 1, 1, 8, 9, 1, 1, 8, 1, 1,\n 1, 11, 5, 1, 8, 5, 5, 1, 5, 1, 5, 5, 1, 5, 9, 8, 1, 1,\n 1, 5, 5, 5, 1, 5, 1, 5, 1, 1, 5, 8, 1, 1, 1, 1, 7, 1,\n 5, 1, 1, 5, 5, 1, 1, 8, 5, 5, 8, 1, 1, 1, 5, 5, 5, 1,\n 5, 1, 5, 5, 1, 1, 5, 1, 5, 1, 1, 1, 5, 1, 1, 1, 
9, 5,\n 1, 1, 7, 1, 1, 1, 1, 8, 1, 1, 5, 5, 1, 5, 1, 1, 1, 5,\n 8, 1])" + "text/plain": [ + "tensor([ 1, 11, 1, 1, 5, 1, 1, 1, 1, 5, 1, 5, 1, 5, 5, 8, 1, 1,\n", + " 7, 1, 5, 5, 8, 5, 5, 1, 1, 1, 1, 8, 9, 1, 1, 8, 1, 1,\n", + " 1, 11, 5, 1, 8, 5, 5, 1, 5, 1, 5, 5, 1, 5, 9, 8, 1, 1,\n", + " 1, 5, 5, 5, 1, 5, 1, 5, 1, 1, 5, 8, 1, 1, 1, 1, 7, 1,\n", + " 5, 1, 1, 5, 5, 1, 1, 8, 5, 5, 8, 1, 1, 1, 5, 5, 5, 1,\n", + " 5, 1, 5, 5, 1, 1, 5, 1, 5, 1, 1, 1, 5, 1, 1, 1, 9, 5,\n", + " 1, 1, 7, 1, 1, 1, 1, 8, 1, 1, 5, 5, 1, 5, 1, 1, 1, 5,\n", + " 8, 1])" + ] }, "metadata": {}, "output_type": "display_data" @@ -574,8 +585,8 @@ "id": "1cfff865", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:29:59.439561Z", - "end_time": "2023-10-09T18:29:59.441907Z" + "end_time": "2023-10-09T18:29:59.441907Z", + "start_time": "2023-10-09T18:29:59.439561Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:34:09.343375Z", @@ -587,7 +598,41 @@ "outputs": [ { "data": { - "text/plain": "array(['basal cell', 'vein endothelial cell', 'basal cell', 'basal cell',\n 'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n 'basal cell', 'epithelial cell', 'epithelial cell', 'leukocyte',\n 'basal cell', 'basal cell', 'keratinocyte', 'basal cell',\n 'epithelial cell', 'epithelial cell', 'leukocyte',\n 'epithelial cell', 'epithelial cell', 'basal cell', 'basal cell',\n 'basal cell', 'basal cell', 'leukocyte', 'pericyte', 'basal cell',\n 'basal cell', 'leukocyte', 'basal cell', 'basal cell',\n 'basal cell', 'vein endothelial cell', 'epithelial cell',\n 'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n 'epithelial cell', 'basal cell', 'epithelial cell', 'pericyte',\n 'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n 'epithelial cell', 'epithelial cell', 'epithelial cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n 'basal cell', 'basal cell', 'epithelial cell', 'leukocyte',\n 'basal cell', 'basal cell', 'basal cell', 'basal cell',\n 'keratinocyte', 'basal cell', 'epithelial cell', 'basal cell',\n 'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n 'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n 'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n 'epithelial cell', 'epithelial cell', 'epithelial cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n 'epithelial cell', 'basal cell', 'basal cell', 'epithelial cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n 'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n 'basal cell', 'pericyte', 'epithelial cell', 'basal cell',\n 'basal cell', 'keratinocyte', 'basal cell', 'basal cell',\n 'basal cell', 'basal cell', 'leukocyte', 'basal cell',\n 'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n 'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n 'epithelial cell', 'leukocyte', 'basal cell'], dtype=object)" + "text/plain": [ + "array(['basal cell', 'vein endothelial cell', 'basal cell', 'basal cell',\n", + " 'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n", + " 'basal cell', 'epithelial cell', 'epithelial cell', 'leukocyte',\n", + " 'basal cell', 'basal cell', 'keratinocyte', 'basal cell',\n", + " 'epithelial cell', 'epithelial cell', 'leukocyte',\n", + " 'epithelial cell', 'epithelial 
cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'basal cell', 'leukocyte', 'pericyte', 'basal cell',\n", + " 'basal cell', 'leukocyte', 'basal cell', 'basal cell',\n", + " 'basal cell', 'vein endothelial cell', 'epithelial cell',\n", + " 'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n", + " 'epithelial cell', 'basal cell', 'epithelial cell', 'pericyte',\n", + " 'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n", + " 'epithelial cell', 'epithelial cell', 'epithelial cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n", + " 'basal cell', 'basal cell', 'epithelial cell', 'leukocyte',\n", + " 'basal cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'keratinocyte', 'basal cell', 'epithelial cell', 'basal cell',\n", + " 'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n", + " 'basal cell', 'leukocyte', 'epithelial cell', 'epithelial cell',\n", + " 'leukocyte', 'basal cell', 'basal cell', 'basal cell',\n", + " 'epithelial cell', 'epithelial cell', 'epithelial cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'epithelial cell',\n", + " 'epithelial cell', 'basal cell', 'basal cell', 'epithelial cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'epithelial cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'pericyte', 'epithelial cell', 'basal cell',\n", + " 'basal cell', 'keratinocyte', 'basal cell', 'basal cell',\n", + " 'basal cell', 'basal cell', 'leukocyte', 'basal cell',\n", + " 'basal cell', 'epithelial cell', 'epithelial cell', 'basal cell',\n", + " 'epithelial cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'epithelial cell', 'leukocyte', 'basal cell'], dtype=object)" + ] }, "metadata": {}, "output_type": "display_data" @@ -616,8 +661,8 @@ "id": "f4ac8087", "metadata": { "ExecuteTime": { - "start_time": "2023-10-09T18:29:59.443175Z", - "end_time": "2023-10-09T18:29:59.471320Z" + "end_time": "2023-10-09T18:29:59.471320Z", + "start_time": "2023-10-09T18:29:59.443175Z" }, "execution": { "iopub.execute_input": "2023-07-28T16:34:09.350404Z", @@ -629,8 +674,106 @@ "outputs": [ { "data": { - "text/plain": " actual cell type predicted cell type\n0 basal cell basal cell\n1 vein endothelial cell vein endothelial cell\n2 basal cell basal cell\n3 basal cell basal cell\n4 epithelial cell epithelial cell\n.. ... ...\n123 basal cell basal cell\n124 basal cell basal cell\n125 epithelial cell epithelial cell\n126 leukocyte leukocyte\n127 basal cell basal cell\n\n[128 rows x 2 columns]", - "text/html": "
<div>…elided: garbled single-line HTML repr of the prediction DataFrame ("actual cell type" vs "predicted cell type", 128 rows × 2 columns); the text/plain repr below carries the same data…</div>"
+      "text/html": [
+       "<div>…same HTML table, reflowed to one JSON string per line…</div>
" + ], + "text/plain": [ + " actual cell type predicted cell type\n", + "0 basal cell basal cell\n", + "1 vein endothelial cell vein endothelial cell\n", + "2 basal cell basal cell\n", + "3 basal cell basal cell\n", + "4 epithelial cell epithelial cell\n", + ".. ... ...\n", + "123 basal cell basal cell\n", + "124 basal cell basal cell\n", + "125 epithelial cell epithelial cell\n", + "126 leukocyte leukocyte\n", + "127 basal cell basal cell\n", + "\n", + "[128 rows x 2 columns]" + ] }, "metadata": {}, "output_type": "display_data" diff --git a/api/python/notebooks/pyproject.toml b/api/python/notebooks/pyproject.toml index 08afbe28b..b1b44179c 100644 --- a/api/python/notebooks/pyproject.toml +++ b/api/python/notebooks/pyproject.toml @@ -1,6 +1,82 @@ -[tool.black] +[tool.ruff] line-length = 120 -target_version = ['py39'] +src = ["api/python/notebooks"] +target-version = "py38" + +[tool.ruff.lint] +select = [ + "F", # Errors detected by Pyflakes + "E", # Error detected by Pycodestyle + "W", # Warning detected by Pycodestyle + "I", # isort + "D", # pydocstyle + "B", # flake8-bugbear + "TID", # flake8-tidy-imports + "C4", # flake8-comprehensions + "BLE", # flake8-blind-except + "UP", # pyupgrade + "RUF100", # Report unused noqa directives +] +ignore = [ + # line too long -> we accept long comment lines; formatter gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E731", + # allow I, O, l as variable names -> I is the identity matrix + "E741", + # Missing docstring in public package + "D104", + # Missing docstring in public module + "D100", + # Missing docstring in __init__ + "D107", + # Errors from function calls in argument defaults. These are fine when the result is immutable. + "B008", + # __magic__ methods are are often self-explanatory, allow missing docstrings + "D105", + # first line should end with a period [Bug: doesn't work with single-line docstrings] + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + ## Disable one in each pair of mutually incompatible rules + # We don’t want a blank line before a class docstring + "D203", + # We want docstrings to start immediately after the opening triple quote + "D213", + # Missing argument description in the docstring TODO: enable + "D417", + # Blank line required between summary line and description TODO: enable + "D205", + # Prefer absolute imports over relative imports from parent modules TODO: enable + "TID252", + # Found useless expression. Either assign it to a variable or remove it TODO: enable + "B018", + # Module level import not at top of cell TODO: enable + "E402", + # Loop control variable `i` not used within loop body TODO: enable + "B007", + # Missing docstring in public class TODO: enable + "D101", + # Missing docstring in public method TODO: enable + "D102", + # Missing docstring in public function TODO: enable + "D103", + # First line should end with a period, question mark, or exclamation point TODO: enable + "D415", +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false +# Like Black, automatically detect the appropriate line ending. 
+line-ending = "auto" [tool.mypy] show_error_codes = true @@ -8,9 +84,3 @@ ignore_missing_imports = true warn_unreachable = true strict = true plugins = "numpy.typing.mypy_plugin" - -[tool.ruff] -select = ["E", "F", "B", "I"] -ignore = ["E501", "E402", "C408", ] -line-length = 120 -target-version = "py39" diff --git a/tools/cell_dup_check/finddups.ipynb b/tools/cell_dup_check/finddups.ipynb index bb82604f8..ee365a0b5 100644 --- a/tools/cell_dup_check/finddups.ipynb +++ b/tools/cell_dup_check/finddups.ipynb @@ -24,15 +24,13 @@ "outputs": [], "source": [ "import math\n", - "import xxhash # https://github.com/ifduyue/python-xxhash\n", "from typing import Literal\n", "\n", - "import ipywidgets\n", - "\n", "import cellxgene_census\n", + "import ipywidgets\n", "import numpy as np\n", "import pandas as pd\n", - "\n", + "import xxhash # https://github.com/ifduyue/python-xxhash\n", "\n", "\"\"\"\n", "Configuration - pick the Census version and experiment to utilize\n", "\"\"\"\n", @@ -70,13 +68,7 @@ "\n", "with cellxgene_census.open_soma(census_version=census_version) as census:\n", " # Used for reporting\n", - " datasets_df = (\n", - " census[\"census_info\"][\"datasets\"]\n", - " .read()\n", - " .concat()\n", - " .to_pandas()\n", - " .drop(columns=[\"soma_joinid\"])\n", - " )\n", + " datasets_df = census[\"census_info\"][\"datasets\"].read().concat().to_pandas().drop(columns=[\"soma_joinid\"])\n", "\n", " # Calculate all per-cell hashes\n", " exp = census[\"census_data\"][experiment]\n", @@ -239,11 +231,9 @@ "source": [ "\"\"\"Compute a summary pivot on the hash and is_primary_data\"\"\"\n", "hash_primary_pivot = (\n", - " obs_df.value_counts(subset=[\"hash\", \"is_primary_data\"])\n", - " .to_frame()\n", - " .reset_index()\n", + " obs_df.value_counts(subset=[\"hash\", \"is_primary_data\"]).to_frame().reset_index()\n", " # somehow values=\"count\" raises an error since the \"count\" column gets renamed to 0\n", - " #.pivot_table(index=\"hash\", columns=\"is_primary_data\", values=\"count\", fill_value=0)\n", + " # .pivot_table(index=\"hash\", columns=\"is_primary_data\", values=\"count\", fill_value=0)\n", ")\n", "hash_primary_pivot" ] @@ -746,9 +736,7 @@ " .reset_index()\n", " .set_index(\"soma_joinid\")\n", ")\n", - "obs_duplicate_primary = obs_duplicate_primary[\n", - " obs_duplicate_primary.is_primary_data == True\n", - "]\n", + "obs_duplicate_primary = obs_duplicate_primary[obs_duplicate_primary.is_primary_data]\n", "obs_duplicate_primary" ] }, @@ -837,18 +825,16 @@ "\"\"\"\n", "For each duplicated hash (cell), find the datasets that overlap/contain the duplicate.\n", "\n", - "In other words, these are all of the datasets which have intersecting duplicate/primary hashes. 
\n", + "In other words, these are all of the datasets which have intersecting duplicate/primary hashes.\n", "\"\"\"\n", "overlapping_dup_datasets = np.unique(\n", - " obs_duplicate_primary.sort_values(\"dataset_id\")[[\"hash\", \"dataset_id\"]]\n", - " .groupby(by=\"hash\")\n", - " .agg(list)\n", + " obs_duplicate_primary.sort_values(\"dataset_id\")[[\"hash\", \"dataset_id\"]].groupby(by=\"hash\").agg(list)\n", ")\n", "\n", "display(overlapping_dup_datasets)\n", "\n", "for d in overlapping_dup_datasets:\n", - " display(datasets_df.set_index('dataset_id').loc[d])" + " display(datasets_df.set_index(\"dataset_id\").loc[d])" ] } ], diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/__main__.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/__main__.py index 2992eaa37..734789668 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/__main__.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/__main__.py @@ -3,7 +3,7 @@ import os import pathlib import sys -from typing import Callable, Sequence +from collections.abc import Callable, Sequence from urllib.parse import urlparse import s3fs @@ -105,9 +105,7 @@ def main() -> int: def _do_steps( build_steps: Sequence[Callable[[CensusBuildArgs], bool]], args: CensusBuildArgs, skip_completed_steps: bool = False ) -> int: - """ - Performs a series of steps as specified by the `build_steps` argument. - """ + """Performs a series of steps as specified by the `build_steps` argument.""" try: for n, build_step in enumerate(build_steps, start=1): step_n_of = f"Build step {build_step.__name__} [{n} of {len(build_steps)}]" @@ -124,7 +122,7 @@ def _do_steps( args.state.commit(args.working_dir / CENSUS_BUILD_STATE) logger.info(f"{step_n_of}: complete") - except Exception: + except Exception: # noqa: BLE001 logger.critical("Caught exception, exiting", exc_info=True) return 1 @@ -190,7 +188,7 @@ def do_create_reports(args: CensusBuildArgs) -> bool: def do_mock_build(args: CensusBuildArgs) -> bool: - """Mock build. Used for testing""" + """Mock build. Used for testing.""" args.soma_path.mkdir(parents=True, exist_ok=False) args.h5ads_path.mkdir(parents=True, exist_ok=False) with open(f"{args.soma_path}/test.soma", "w") as f: @@ -202,7 +200,7 @@ def do_mock_build(args: CensusBuildArgs) -> bool: def do_data_copy(args: CensusBuildArgs) -> bool: - """Copy data to S3, in preparation for a release""" + """Copy data to S3, in preparation for a release.""" from .data_copy import sync_to_S3 sync_to_S3( @@ -214,10 +212,7 @@ def do_data_copy(args: CensusBuildArgs) -> bool: def do_the_release(args: CensusBuildArgs) -> bool: - """ - Perform the release by publishing changes to the release.json file. Respects `dryrun` flag. - """ - + """Perform the release by publishing changes to the release.json file. Respects `dryrun` flag.""" from .release_manifest import CensusVersionDescription, make_a_release parsed_url = urlparse(args.config.cellxgene_census_S3_path) @@ -258,7 +253,7 @@ def do_report_copy(args: CensusBuildArgs) -> bool: def do_old_release_cleanup(args: CensusBuildArgs) -> bool: - """Clean up old releases""" + """Clean up old releases.""" from .release_cleanup import remove_releases_older_than remove_releases_older_than( @@ -270,7 +265,7 @@ def do_old_release_cleanup(args: CensusBuildArgs) -> bool: def do_log_copy(args: CensusBuildArgs) -> bool: - """Copy logs to S3 for posterity. Should be the final step, to capture full output of build""" + """Copy logs to S3 for posterity. 
Should be the final step, to capture full output of build.""" from .data_copy import sync_to_S3 sync_to_S3( @@ -282,7 +277,7 @@ def do_log_copy(args: CensusBuildArgs) -> bool: def do_sync_release_file_to_replica_s3_bucket(args: CensusBuildArgs) -> bool: - """Copy release.json to replica S3 bucket""" + """Copy release.json to replica S3 bucket.""" from .data_copy import sync_to_S3_remote source_key = urlcat(args.config.cellxgene_census_S3_path, args.build_tag, "release.json") @@ -297,8 +292,7 @@ def do_sync_release_file_to_replica_s3_bucket(args: CensusBuildArgs) -> bool: def do_sync_to_replica_s3_bucket(args: CensusBuildArgs) -> bool: - """ - Sync data to replica S3 bucket. Syncs everything and deletes anything + """Sync data to replica S3 bucket. Syncs everything and deletes anything in the replica bucket that is not in the primary bucket. """ from .data_copy import sync_to_S3_remote diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py index a50c8003b..8ea492ac2 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/anndata.py @@ -1,7 +1,7 @@ import logging from functools import cached_property from os import PathLike -from typing import Any, List, Optional, Protocol, Self, Tuple, TypedDict, cast +from typing import Any, Protocol, Self, TypedDict, cast import h5py import numpy as np @@ -16,13 +16,10 @@ logger = logging.getLogger(__name__) -AnnDataFilterSpec = TypedDict( - "AnnDataFilterSpec", - { - "organism_ontology_term_id": Optional[str], - "assay_ontology_term_ids": Optional[List[str]], - }, -) + +class AnnDataFilterSpec(TypedDict): + organism_ontology_term_id: str | None + assay_ontology_term_ids: list[str] | None # Indexing types @@ -31,7 +28,7 @@ def _slice_index(prev: Index1D, new: Index1D, length: int) -> slice | npt.NDArray[np.int64]: - """Slice an index""" + """Slice an index.""" if isinstance(prev, slice): if isinstance(new, slice): # conveniently, ranges support indexing! @@ -61,8 +58,7 @@ def _normed_index(idx: Index) -> tuple[Index1D, Index1D]: class AnnDataProxy: - """ - Recommend using `open_anndata()` rather than instantiating this class directly. + """Recommend using `open_anndata()` rather than instantiating this class directly. AnnData-like proxy for the version 0.1.0 AnnData H5PY file encoding (aka H5AD). Used in lieu of the AnnData class to reduce memory overhead. Semantics very similar @@ -95,8 +91,8 @@ def __init__( view_of: Self | None = None, obs_idx: slice | npt.NDArray[np.int64] | None = None, var_idx: slice | npt.NDArray[np.int64] | None = None, - obs_column_names: Optional[Tuple[str, ...]] = None, - var_column_names: Optional[Tuple[str, ...]] = None, + obs_column_names: tuple[str, ...] | None = None, + var_column_names: tuple[str, ...] | None = None, ): self.filename = filename @@ -160,7 +156,7 @@ def __getitem__(self, key: Index) -> "AnnDataProxy": vdx = _slice_index(self._var_idx, vdx, self.n_vars) return AnnDataProxy(self.filename, view_of=self, obs_idx=odx, var_idx=vdx) - def _load_dataframe(self, elem: h5py.Group, column_names: Optional[Tuple[str, ...]]) -> pd.DataFrame: + def _load_dataframe(self, elem: h5py.Group, column_names: tuple[str, ...] 
| None) -> pd.DataFrame: # if reading all, just use the built-in if not column_names: return cast(pd.DataFrame, read_elem(elem)) @@ -173,18 +169,18 @@ def _load_dataframe(self, elem: h5py.Group, column_names: Optional[Tuple[str, .. ), "Unsupported AnnData encoding-type or encoding-version - likely indicates file was created with an unsupported AnnData version" column_order = elem.attrs["column-order"] column_names_ordered = [c for c in column_order if c in column_names and c != "_index"] - index: Optional[npt.NDArray[Any]] = None + index: npt.NDArray[Any] | None = None if "_index" in column_names: index_col_name = elem.attrs["_index"] index = read_elem(elem[index_col_name]) return pd.DataFrame({c: read_elem(elem[c]) for c in column_names_ordered}, index=index) def _load_h5ad( - self, obs_column_names: Optional[Tuple[str, ...]], var_column_names: Optional[Tuple[str, ...]] + self, obs_column_names: tuple[str, ...] | None, var_column_names: tuple[str, ...] | None ) -> tuple[pd.DataFrame, pd.DataFrame, CSRDataset | CSCDataset | h5py.Dataset]: - """ - A memory optimization to prevent reading unnecessary data from the H5AD. This includes - skipping: + """A memory optimization to prevent reading unnecessary data from the H5AD. + + This includes skipping: * obsm/varm/obsp/varp * unused obs/var columns * reading both raw and !raw @@ -200,7 +196,6 @@ This code utilizes the AnnData on-disk spec and several experimental APIs (as of 0.10.0). Spec: https://anndata.readthedocs.io/en/latest/fileformat-prose.html """ - file = h5py.File(self.filename, mode="r") # Known to be compatible with this AnnData file encoding @@ -241,11 +236,10 @@ def open_anndata( dataset: Dataset, *, include_filter_columns: bool = False, - obs_column_names: Optional[Tuple[str, ...]] = None, - var_column_names: Optional[Tuple[str, ...]] = None, + obs_column_names: tuple[str, ...] | None = None, + var_column_names: tuple[str, ...] | None = None, ) -> AnnDataProxy: - """ - Open the dataset and return an AnnData-like AnnDataProxy object. + """Open the dataset and return an AnnData-like AnnDataProxy object. Args: {obs,var}_column_names: if specified, determine which columns are loaded for the respective dataframes. include_filter_columns: if True, ensure that any obs/var columns required for H5AD filtering are included. If False (default), only load the columns specified by the user. """ - if include_filter_columns: obs_column_names = tuple(set(CXG_OBS_COLUMNS_MINIMUM_READ + (obs_column_names or ()))) var_column_names = tuple(set(CXG_VAR_COLUMNS_MINIMUM_READ + (var_column_names or ()))) @@ -271,8 +264,7 @@ def __call__(self, ad: AnnDataProxy) -> AnnDataProxy: def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFunction: - """ - Return an anndata sliced/filtered for those cells/genes of interest. + """Return an anndata sliced/filtered for those cells/genes of interest. 
obs filter: * not organoid or cell culture @@ -283,7 +275,6 @@ def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFun var filter: * genes only (var.feature_biotype == 'gene') """ - organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None) assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None) diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py index b83e2d6e0..f6daf9630 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py @@ -2,8 +2,8 @@ import logging import os import pathlib -from datetime import datetime, timezone -from typing import Iterator, List +from collections.abc import Iterator +from datetime import UTC, datetime import tiledbsoma as soma @@ -35,9 +35,7 @@ def prepare_file_system(args: CensusBuildArgs) -> None: - """ - Prepares the file system for the builder run - """ + """Prepares the file system for the builder run.""" # Don't clobber an existing census build if args.soma_path.exists() or args.h5ads_path.exists(): raise Exception("Census build path already exists - aborting build") @@ -52,7 +50,8 @@ def prepare_file_system(args: CensusBuildArgs) -> None: def build(args: CensusBuildArgs) -> int: - """ + """Build. + Approximately, build steps are: 1. Download manifest and copy/stage all source assets 2. Read all H5AD and create axis dataframe (serial) @@ -61,13 +60,11 @@ 3. Read all H5AD assets again, write X layer (parallel) 4. Optional: validate - Returns - ------- + Returns: int Process completion code, 0 on success, non-zero indicating error, suitable for providing to sys.exit() """ - experiment_builders = make_experiment_builders() prepare_file_system(args) @@ -109,13 +106,13 @@ def build(args: CensusBuildArgs) -> int: return 0 -def prune_unused_datasets(assets_path: pathlib.Path, all_datasets: List[Dataset], used_datasets: List[Dataset]) -> None: - """Remove any staged H5AD not used to build the SOMA object, ie. those which do not contribute at least one cell to the Census""" - used_dataset_ids = set(d.dataset_id for d in used_datasets) +def prune_unused_datasets(assets_path: pathlib.Path, all_datasets: list[Dataset], used_datasets: list[Dataset]) -> None: + """Remove any staged H5AD not used to build the SOMA object, i.e., those which do not contribute at least one cell to the Census.""" + used_dataset_ids = set(d.dataset_id for d in used_datasets) # noqa: C401 unused_datasets = [d for d in all_datasets if d.dataset_id not in used_dataset_ids] assert all(d.dataset_total_cell_count == 0 for d in unused_datasets) assert all(d.dataset_total_cell_count > 0 for d in used_datasets) - assert used_dataset_ids.isdisjoint(set(d.dataset_id for d in unused_datasets)) + assert used_dataset_ids.isdisjoint(set(d.dataset_id for d in unused_datasets)) # noqa: C401 for d in unused_datasets: logger.debug(f"Removing unused H5AD {d.dataset_h5ad_path}") @@ -123,14 +120,12 @@ def populate_root_collection(root_collection: soma.Collection) -> soma.Collection: - """ - Create the root SOMA collection for the Census. + """Create the root SOMA collection for the Census. Returns the root collection. 
""" - # Set root metadata for the experiment - root_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") + root_collection.metadata["created_on"] = datetime.now(tz=UTC).isoformat(timespec="seconds") sha = get_git_commit_sha() root_collection.metadata["git_commit_sha"] = sha @@ -142,7 +137,7 @@ def populate_root_collection(root_collection: soma.Collection) -> soma.Collectio return root_collection -def build_step1_get_source_datasets(args: CensusBuildArgs) -> List[Dataset]: +def build_step1_get_source_datasets(args: CensusBuildArgs) -> list[Dataset]: logger.info("Build step 1 - get source assets - started") # Load manifest defining the datasets @@ -167,8 +162,8 @@ def build_step1_get_source_datasets(args: CensusBuildArgs) -> List[Dataset]: def accumulate_axes( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: CensusBuildArgs -) -> List[Dataset]: + assets_path: str, datasets: list[Dataset], experiment_builders: list[ExperimentBuilder], args: CensusBuildArgs +) -> list[Dataset]: filtered_datasets = [] N = len(datasets) * len(experiment_builders) n = 0 @@ -212,9 +207,8 @@ def accumulate_axes( return filtered_datasets -def build_step2_create_root_collection(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> soma.Collection: - """ - Create all objects +def build_step2_create_root_collection(soma_path: str, experiment_builders: list[ExperimentBuilder]) -> soma.Collection: + """Create all objects. Returns: the root collection. """ @@ -231,11 +225,9 @@ def build_step2_create_root_collection(soma_path: str, experiment_builders: List def build_step3_populate_obs_and_var_axes( - assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: CensusBuildArgs -) -> List[Dataset]: - """ - Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built. - """ + assets_path: str, datasets: list[Dataset], experiment_builders: list[ExperimentBuilder], args: CensusBuildArgs +) -> list[Dataset]: + """Populate obs and var axes. Filter cells from datasets for each experiment, as obs is built.""" logger.info("Build step 3 - Populate obs and var axes - started") filtered_datasets = accumulate_axes(assets_path, datasets, experiment_builders, args) @@ -253,13 +245,11 @@ def build_step3_populate_obs_and_var_axes( def build_step4_populate_X_layers( assets_path: str, - filtered_datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], + filtered_datasets: list[Dataset], + experiment_builders: list[ExperimentBuilder], args: CensusBuildArgs, ) -> None: - """ - Populate X layers. 
- """ + """Populate X layers.""" logger.info("Build step 4 - Populate X layers - started") # Process all X data @@ -276,8 +266,8 @@ def build_step4_populate_X_layers( def build_step5_save_axis_and_summary_info( root_collection: soma.Collection, - experiment_builders: List[ExperimentBuilder], - filtered_datasets: List[Dataset], + experiment_builders: list[ExperimentBuilder], + filtered_datasets: list[Dataset], build_tag: str, ) -> None: logger.info("Build step 5 - Save axis and summary info - started") @@ -295,7 +285,7 @@ def build_step5_save_axis_and_summary_info( def build_step6_save_derived_data( - root_collection: soma.Collection, experiment_builders: List[ExperimentBuilder], args: CensusBuildArgs + root_collection: soma.Collection, experiment_builders: list[ExperimentBuilder], args: CensusBuildArgs ) -> None: logger.info("Build step 6 - Creating derived objects - started") @@ -311,8 +301,7 @@ def build_step6_save_derived_data( def tiledb_soma_1969_work_around(census_uri: str) -> None: - """See single-cell-data/TileDB-SOMA#1969 and other issues related. Remove any inserted bounding box metadata""" - + """See single-cell-data/TileDB-SOMA#1969 and other issues related. Remove any inserted bounding box metadata.""" bbox_metadata_keys = [ "soma_dim_0_domain_lower", "soma_dim_0_domain_upper", @@ -320,7 +309,7 @@ def tiledb_soma_1969_work_around(census_uri: str) -> None: "soma_dim_1_domain_upper", ] - def _walk_tree(C: soma.Collection) -> List[str]: + def _walk_tree(C: soma.Collection) -> list[str]: assert C.soma_type in ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"] uris = [] for soma_obj in C.values(): diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py index 9d9e5501d..84cc677b2 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py @@ -1,5 +1,5 @@ import logging -from typing import Sequence +from collections.abc import Sequence import pandas as pd import pyarrow as pa diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/consolidate.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/consolidate.py index 5457285a8..69f6cd509 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/consolidate.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/consolidate.py @@ -1,7 +1,7 @@ import logging import re +from collections.abc import Sequence from concurrent.futures import Executor, Future, as_completed -from typing import List, Optional, Sequence import attrs import tiledb @@ -47,11 +47,10 @@ def submit_consolidate( uri: str, pool: Executor, vacuum: bool, - include: Optional[Sequence[str]] = None, - exclude: Optional[Sequence[str]] = None, + include: Sequence[str] | None = None, + exclude: Sequence[str] | None = None, ) -> Sequence[Future[str]]: - """ - This is a non-portable, TileDB-specific consolidation routine. Returns sequence of + """This is a non-portable, TileDB-specific consolidation routine. Returns sequence of futures, each of which returns the URI for the array/group. Will vacuum if requested. Excludes any object URI matching a regex in the exclude list. 
@@ -73,7 +72,7 @@ def submit_consolidate( return futures -def _gather(uri: str) -> List[ConsolidationCandidate]: +def _gather(uri: str) -> list[ConsolidationCandidate]: # Gather URIs for any arrays that potentially need consolidation with soma.Collection.open(uri, context=SOMA_TileDB_Context()) as census: uris_to_consolidate = list_uris_to_consolidate(census) @@ -82,10 +81,8 @@ def _gather(uri: str) -> List[ConsolidationCandidate]: def list_uris_to_consolidate( collection: soma.Collection, -) -> List[ConsolidationCandidate]: - """ - Recursively walk the soma.Collection and return all uris for soma_types that can be consolidated and vacuumed. - """ +) -> list[ConsolidationCandidate]: + """Recursively walk the soma.Collection and return all uris for soma_types that can be consolidated and vacuumed.""" uris = [] for soma_obj in collection.values(): type = soma_obj.soma_type diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py index 558db3ea6..0ef5c4ba0 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/datasets.py @@ -1,6 +1,6 @@ import dataclasses import logging -from typing import List, Type, TypeVar +from typing import TypeVar import pandas as pd import pyarrow as pa @@ -15,9 +15,7 @@ @dataclasses.dataclass # TODO: use attrs class Dataset: - """ - Type used to handle source H5AD datasets read from manifest - """ + """Type used to handle source H5AD datasets read from manifest.""" # Required dataset_id: str # CELLxGENE dataset_id @@ -43,35 +41,31 @@ class Dataset: soma_joinid: int = -1 def __post_init__(self) -> None: - """ - Type contracts - downstream code assume these types, so enforce it. 
- """ + """Type contracts - downstream code assume these types, so enforce it.""" for f in dataclasses.fields(self): assert isinstance( getattr(self, f.name), f.type ), f"{f.name} has incorrect type, expected {f.type}, got {type(getattr(self,f.name))}" @classmethod - def to_dataframe(cls: Type[T], datasets: List[T]) -> pd.DataFrame: + def to_dataframe(cls: type[T], datasets: list[T]) -> pd.DataFrame: if len(datasets) == 0: return pd.DataFrame({field.name: pd.Series(dtype=field.type) for field in dataclasses.fields(cls)}) return pd.DataFrame(datasets) @classmethod - def from_dataframe(cls: Type[T], datasets: pd.DataFrame) -> List["Dataset"]: + def from_dataframe(cls: type[T], datasets: pd.DataFrame) -> list["Dataset"]: return [Dataset(**r) for r in datasets.to_dict("records")] # type: ignore[misc] -def assign_dataset_soma_joinids(datasets: List[Dataset]) -> None: +def assign_dataset_soma_joinids(datasets: list[Dataset]) -> None: for joinid, dataset in enumerate(datasets): dataset.soma_joinid = joinid -def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dataset]) -> None: - """ - Write the Census `census_datasets` dataframe - """ +def create_dataset_manifest(info_collection: soma.Collection, datasets: list[Dataset]) -> None: + """Write the Census `census_datasets` dataframe.""" logger.info("Creating dataset_manifest") manifest_df = Dataset.to_dataframe(datasets) manifest_df = manifest_df[list(CENSUS_DATASETS_TABLE_SPEC.field_names())] diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py index c5e2a79a4..e53c62ffc 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py @@ -1,16 +1,11 @@ import concurrent.futures import gc import logging +from collections.abc import Generator, Sequence from contextlib import ExitStack from typing import ( - Dict, - Generator, - List, - Optional, - Sequence, - Tuple, + Self, TypedDict, - Union, overload, ) @@ -25,7 +20,6 @@ import tiledbsoma as soma from scipy import sparse from somacore.options import OpenMode -from typing_extensions import Self from ..build_state import CensusBuildArgs from ..util import log_process_resource_status, urlcat @@ -80,11 +74,11 @@ class AxisStats: var_stats: pd.DataFrame -AccumulateXResult = Tuple[PresenceResult, AxisStats] +AccumulateXResult = tuple[PresenceResult, AxisStats] AccumulateXResults = Sequence[AccumulateXResult] -def _assert_open_for_write(obj: Optional[somacore.SOMAObject]) -> None: +def _assert_open_for_write(obj: somacore.SOMAObject | None) -> None: assert obj is not None assert obj.exists(obj.uri) assert obj.mode == "w" @@ -93,8 +87,7 @@ def _assert_open_for_write(obj: Optional[somacore.SOMAObject]) -> None: @attrs.define(frozen=True) class ExperimentSpecification: - """ - Declarative "specification" of a SOMA experiment. This is a read-only + """Declarative "specification" of a SOMA experiment. This is a read-only specification, independent of the datasets used to build the census. Parameters: @@ -119,8 +112,7 @@ def create( class ExperimentBuilder: - """ - Class that embodies the operators and state to build an Experiment. + """Class that embodies the operators and state to build an Experiment. The creation and driving of these objects is done by the main loop. 
""" @@ -133,15 +125,15 @@ def __init__(self, specification: ExperimentSpecification): self.n_var: int = 0 self.n_datasets: int = 0 self.n_donors: int = 0 # Caution: defined as (unique dataset_id, donor_id) tuples, *excluding* some values - self.obs_df_accumulation: List[pd.DataFrame] = [] - self.obs_df: Optional[pd.DataFrame] = None - self.var_df: Optional[pd.DataFrame] = None - self.dataset_obs_joinid_start: Dict[str, int] = {} + self.obs_df_accumulation: list[pd.DataFrame] = [] + self.obs_df: pd.DataFrame | None = None + self.var_df: pd.DataFrame | None = None + self.dataset_obs_joinid_start: dict[str, int] = {} self.census_summary_cell_counts: pd.DataFrame = init_summary_counts_accumulator() - self.experiment: Optional[soma.Experiment] = None # initialized in create() - self.experiment_uri: Optional[str] = None # initialized in create() - self.global_var_joinids: Optional[pd.DataFrame] = None - self.presence: Dict[int, Tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]] = {} + self.experiment: soma.Experiment | None = None # initialized in create() + self.experiment_uri: str | None = None # initialized in create() + self.global_var_joinids: pd.DataFrame | None = None + self.presence: dict[int, tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]] = {} @property def name(self) -> str: @@ -153,7 +145,6 @@ def anndata_cell_filter_spec(self) -> AnnDataFilterSpec: def create(self, census_data: soma.Collection) -> None: """Create experiment within the specified Collection with a single Measurement.""" - logger.info(f"{self.name}: create experiment at {urlcat(census_data.uri, self.name)}") self.experiment = census_data.add_new_collection(self.name, soma.Experiment) @@ -165,17 +156,15 @@ def create(self, census_data: soma.Collection) -> None: # make measurement and add to ms collection ms.add_new_collection(MEASUREMENT_RNA_NAME, soma.Measurement) - def filter_anndata_cells(self, ad: anndata.AnnData) -> Union[None, anndata.AnnData]: + def filter_anndata_cells(self, ad: anndata.AnnData) -> None | anndata.AnnData: anndata_cell_filter = make_anndata_cell_filter(self.anndata_cell_filter_spec) return anndata_cell_filter(ad) def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData) -> int: - """ - Build (accumulate) in-memory obs and var. + """Build (accumulate) in-memory obs and var. Returns: number of cells that make it past the experiment filter. """ - assert len(ad.obs) > 0 # Narrow columns just to minimize memory footprint. 
Summary cell counting @@ -297,9 +286,7 @@ def populate_var_axis(self) -> None: self.n_var = 0 def create_X_with_layers(self) -> None: - """ - Create layers in ms['RNA']/X - """ + """Create layers in ms['RNA']/X.""" logger.info(f"{self.name}: create X layers") rna_measurement = self.experiment.ms[MEASUREMENT_RNA_NAME] # type:ignore @@ -319,10 +306,8 @@ def create_X_with_layers(self) -> None: platform_config=CENSUS_X_LAYERS_PLATFORM_CONFIG[layer_name], ) - def populate_presence_matrix(self, datasets: List[Dataset]) -> None: - """ - Save presence matrix per Experiment - """ + def populate_presence_matrix(self, datasets: list[Dataset]) -> None: + """Save presence matrix per Experiment.""" _assert_open_for_write(self.experiment) logger.info(f"Save presence matrix for {self.name} - start") @@ -369,10 +354,7 @@ def write_X_normalized(self, args: CensusBuildArgs) -> None: WRITE_NORM_STRIDE = 2**18 # controls TileDB fragment size, which impacts consolidation time mem_budget = ( # (20 bytes per COO X stride X typical-nnz X overhead) + static-allocation + passed-data-size - int(20 * WRITE_NORM_STRIDE * 4000 * 2) - + (3 * 1024**3) - + feature_length.nbytes - + is_smart_seq.nbytes + int(20 * WRITE_NORM_STRIDE * 4000 * 2) + (3 * 1024**3) + feature_length.nbytes + is_smart_seq.nbytes ) n_workers = n_workers_from_memory_budget(args, mem_budget) with create_process_pool_executor(args, max_workers=n_workers) as pe: @@ -398,12 +380,11 @@ def write_X_normalized(self, args: CensusBuildArgs) -> None: def _get_axis_stats( - raw_X: Union[sparse.spmatrix, npt.NDArray[np.float32]], + raw_X: sparse.spmatrix | npt.NDArray[np.float32], dataset_obs_joinid_start: int, local_var_joinids: npt.NDArray[np.int64], -) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Generate obs and var summary stats, e.g., raw_sum, etc. +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Generate obs and var summary stats, e.g., raw_sum, etc. Return tuple of (obs_stats_df, var_stats_df), both indexed by soma_joinid. """ @@ -431,7 +412,7 @@ class AccumXEBParams: n_obs: int n_var: int anndata_cell_filter_spec: AnnDataFilterSpec - global_var_joinids: Optional[pd.DataFrame] + global_var_joinids: pd.DataFrame | None experiment_uri: str @@ -442,14 +423,14 @@ class AccumXEBParams: def _accumulate_all_X_layers( assets_path: str, dataset: Dataset, - experiment_builders: List[AccumXEBParams], - dataset_obs_joinid_starts: List[Union[None, int]], + experiment_builders: list[AccumXEBParams], + dataset_obs_joinid_starts: list[None | int], ms_name: str, - progress: Tuple[int, int], + progress: tuple[int, int], ) -> AccumulateXResults: - """ - For this dataset, save all X layer information for each Experiment. This currently - includes: + """For this dataset, save all X layer information for each Experiment. + + This currently includes: X['raw'] - raw counts Also accumulates presence information per dataset. 
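The presence accumulation mentioned above boils down to recording, per dataset, which genes have at least one nonzero raw count, keyed by global var joinids (matching the `dict[int, tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]]` accumulator declared earlier). A simplified sketch of that idea, illustrative only and not the builder's exact code:

    import numpy as np
    from scipy import sparse

    def dataset_presence(raw_X: sparse.csr_matrix, local_var_joinids: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        # A gene is "present" in this dataset when its column has at least one nonzero entry.
        present = raw_X.getnnz(axis=0) > 0
        return present, local_var_joinids[present]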
@@ -459,8 +440,8 @@ def _accumulate_all_X_layers( logger.info(f"Saving X layer for dataset - start {dataset.dataset_id} ({progress[0]} of {progress[1]})") unfiltered_ad = open_anndata(assets_path, dataset, include_filter_columns=True, var_column_names=("_index",)) - results: List[AccumulateXResult] = [] - for eb, dataset_obs_joinid_start in zip(experiment_builders, dataset_obs_joinid_starts): + results: list[AccumulateXResult] = [] + for eb, dataset_obs_joinid_start in zip(experiment_builders, dataset_obs_joinid_starts, strict=False): if dataset_obs_joinid_start is None: # this dataset has no data for this experiment continue @@ -577,8 +558,8 @@ def _accumulate_X( assets_path: str, dataset: Dataset, - experiment_builders: List["ExperimentBuilder"], - progress: Tuple[int, int], + experiment_builders: list["ExperimentBuilder"], + progress: tuple[int, int], ) -> AccumulateXResults: ... @@ -587,9 +568,9 @@ def _accumulate_X( assets_path: str, dataset: Dataset, - experiment_builders: List["ExperimentBuilder"], - progress: Tuple[int, int], - executor: Optional[ResourcePoolProcessExecutor], + experiment_builders: list["ExperimentBuilder"], + progress: tuple[int, int], + executor: ResourcePoolProcessExecutor | None, ) -> concurrent.futures.Future[AccumulateXResults]: ... @@ -597,15 +578,13 @@ def _accumulate_X( assets_path: str, dataset: Dataset, - experiment_builders: List["ExperimentBuilder"], - progress: Tuple[int, int], - executor: Optional[ResourcePoolProcessExecutor] = None, -) -> Union[concurrent.futures.Future[AccumulateXResults], AccumulateXResults]: - """ - Save X layer data for a single AnnData, for all Experiments. Return a future if + experiment_builders: list["ExperimentBuilder"], + progress: tuple[int, int], + executor: ResourcePoolProcessExecutor | None = None, +) -> concurrent.futures.Future[AccumulateXResults] | AccumulateXResults: + """Save X layer data for a single AnnData, for all Experiments. Return a future if executor is specified, otherwise immediately do the work. """ - # build params to pass to child workers - this avoids pickling unnecessary # data (or data that can't be pickled) eb_params = [] @@ -668,16 +647,14 @@ def populate_X_layers( assets_path: str, - datasets: List[Dataset], - experiment_builders: List[ExperimentBuilder], + datasets: list[Dataset], + experiment_builders: list[ExperimentBuilder], args: CensusBuildArgs, ) -> None: - """ - Do all X layer processing for all Experiments. Also accumulate presence matrix data for later writing. - """ + """Do all X layer processing for all Experiments. 
Also accumulate presence matrix data for later writing.""" # populate X layers logger.debug("populate_X_layers begin") - results: List[AccumulateXResult] = [] + results: list[AccumulateXResult] = [] if args.config.multi_process: # reserve memory to accumulate the stats n_obs = sum(eb.n_obs for eb in experiment_builders) @@ -739,7 +716,7 @@ class SummaryStats(TypedDict): total_cell_count: int unique_cell_count: int - number_donors: Dict[str, int] + number_donors: dict[str, int] def get_summary_stats(experiment_builders: Sequence[ExperimentBuilder]) -> SummaryStats: @@ -751,8 +728,7 @@ def get_summary_stats(experiment_builders: Sequence[ExperimentBuilder]) -> Summa def add_tissue_mapping(obs_df: pd.DataFrame, dataset_id: str) -> None: - """Inplace addition of tissue_general-related columns""" - + """In-place addition of tissue_general-related columns.""" # UBERON tissue term mapper from .tissue_mapper import TissueMapper # type: ignore @@ -774,10 +750,9 @@ def add_tissue_mapping(obs_df: pd.DataFrame, dataset_id: str) -> None: def reopen_experiment_builders( - experiment_builders: List[ExperimentBuilder], mode: OpenMode = "w" + experiment_builders: list[ExperimentBuilder], mode: OpenMode = "w" ) -> Generator[ExperimentBuilder, None, None]: - """ - Re-opens all ExperimentBuilder's `experiment` for writing as a Generator, allowing iterating code to use + """Re-opens all ExperimentBuilder's `experiment` for writing as a Generator, allowing iterating code to use the experiment for writing, without having to explicitly close it. """ with ExitStack() as experiments_stack: @@ -791,9 +766,8 @@ def reopen_experiment_builders( yield eb -def _write_X_normalized(args: Tuple[str, int, int, npt.NDArray[np.int64], npt.NDArray[np.bool_]]) -> None: - """ - Helper for ExperimentBuilder.write_X_normalized. +def _write_X_normalized(args: tuple[str, int, int, npt.NDArray[np.int64], npt.NDArray[np.bool_]]) -> None: + """Helper for ExperimentBuilder.write_X_normalized. Read indicated rows from X['raw'], write to X['normalized'] """ @@ -862,8 +836,7 @@ def _normalize( @numba.jit(nopython=True, nogil=True) # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 def _roundHalfToEven(a: npt.NDArray[np.float32], keepbits: int) -> npt.NDArray[np.float32]: - """ - Generate reduced precision floating point array, with round half to even. + """Generate reduced precision floating point array, with round half to even. IMPORTANT: In-place operation. Ref: https://gmd.copernicus.org/articles/14/377/2021/gmd-14-377-2021.html diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py index 86aca574b..649a12617 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py @@ -1,14 +1,12 @@ import functools -from typing import List from .experiment_builder import ExperimentBuilder, ExperimentSpecification from .globals import RNA_SEQ @functools.cache -def make_experiment_specs() -> List[ExperimentSpecification]: - """ - Define all soma.Experiments to build in the census. +def make_experiment_specs() -> list[ExperimentSpecification]: + """Define all soma.Experiments to build in the census. Functionally, this defines per-experiment name, anndata filter, etc. It also loads any required per-Experiment assets. 
@@ -16,15 +14,21 @@ def make_experiment_specs() -> List[ExperimentSpecification]: return [ # The soma.Experiments we want to build ExperimentSpecification.create( name="homo_sapiens", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), + anndata_cell_filter_spec={ + "organism_ontology_term_id": "NCBITaxon:9606", + "assay_ontology_term_ids": RNA_SEQ, + }, ), ExperimentSpecification.create( name="mus_musculus", - anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), + anndata_cell_filter_spec={ + "organism_ontology_term_id": "NCBITaxon:10090", + "assay_ontology_term_ids": RNA_SEQ, + }, ), ] @functools.cache -def make_experiment_builders() -> List[ExperimentBuilder]: +def make_experiment_builders() -> list[ExperimentBuilder]: return [ExperimentBuilder(spec) for spec in make_experiment_specs()] diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py index edfbebcc2..5d99b813a 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py @@ -1,5 +1,5 @@ import functools -from typing import Any, List, Set, Tuple, Union +from typing import Any import pyarrow as pa import tiledbsoma as soma @@ -101,13 +101,13 @@ "tissue_ontology_term_id", "tissue_type", ] -CXG_OBS_COLUMNS_READ: Tuple[str, ...] = ( # Columns READ from the CXG H5AD - see open_anndata() +CXG_OBS_COLUMNS_READ: tuple[str, ...] = ( # Columns READ from the CXG H5AD - see open_anndata() *CXG_OBS_TERM_COLUMNS, "organism", "organism_ontology_term_id", ) CENSUS_OBS_STATS_COLUMNS = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"] -CENSUS_OBS_FIELDS: List[Union[FieldSpec, Tuple[str, pa.DataType]]] = [ +CENSUS_OBS_FIELDS: list[FieldSpec | tuple[str, pa.DataType]] = [ ("soma_joinid", pa.int64()), FieldSpec(name="dataset_id", type=pa.large_string(), is_dictionary=True), FieldSpec(name="assay", type=pa.large_string(), is_dictionary=True), @@ -158,7 +158,7 @@ if f.name not in (_DictLikeObsAttrs + _NumericObsAttrs + ["soma_joinid"]) ] # Dict filter varies depending on whether we are using dictionary types in the schema -_DictLikeFilter: List[Any] = ( +_DictLikeFilter: list[Any] = ( [{"_type": "ZstdFilter", "level": 19}] if USE_ARROW_DICTIONARY else ["DictionaryFilter", {"_type": "ZstdFilter", "level": 19}] @@ -181,7 +181,7 @@ } } -CXG_VAR_COLUMNS_READ: Tuple[str, ...] = ( +CXG_VAR_COLUMNS_READ: tuple[str, ...] = ( "_index", "feature_name", "feature_length", @@ -290,7 +290,7 @@ # Feature_reference values which are ignored (not considered) in # multi-organism filtering. Currently the null set. -FEATURE_REFERENCE_IGNORE: Set[str] = set() +FEATURE_REFERENCE_IGNORE: set[str] = set() # The default configuration for TileDB contexts used in the builder. 
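The typing changes running through these hunks all follow one mechanical pattern: with the `UP` (pyupgrade) rules enabled in the configs above, `ruff --fix` rewrites `typing` aliases to builtin generics and PEP 604 unions. A representative before/after pair, illustrative rather than a line from this patch:

    # Before: typing aliases
    from typing import Dict, List, Optional, Tuple

    def summarize(ids: Optional[List[str]], shape: Tuple[int, int]) -> Dict[str, int]: ...

    # After: builtin generics and PEP 604 unions
    def summarize(ids: list[str] | None, shape: tuple[int, int]) -> dict[str, int]: ...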
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py index 6f64802b6..9740706f3 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/manifest.py @@ -2,7 +2,6 @@ import io import logging import os.path -from typing import List, Set import fsspec @@ -15,17 +14,15 @@ CXG_BASE_URI = "https://api.cellxgene.cziscience.com/" -def parse_manifest_file(manifest_fp: io.TextIOBase) -> List[Dataset]: - """ - return manifest as list of tuples, (dataset_id, URI/path), read from the text stream - """ +def parse_manifest_file(manifest_fp: io.TextIOBase) -> list[Dataset]: + """Return the manifest as a list of Dataset objects, (dataset_id, URI/path), read from the text stream.""" # skip comments and strip leading/trailing white space skip_comments = csv.reader(row for row in manifest_fp if not row.startswith("#")) stripped = [[r.strip() for r in row] for row in skip_comments] return [Dataset(dataset_id=r[0], dataset_asset_h5ad_uri=r[1]) for r in stripped] -def dedup_datasets(datasets: List[Dataset]) -> List[Dataset]: +def dedup_datasets(datasets: list[Dataset]) -> list[Dataset]: ds = {d.dataset_id: d for d in datasets} if len(ds) != len(datasets): logger.warning("Dataset manifest contained DUPLICATES, which will be ignored.") @@ -33,7 +30,7 @@ return datasets -def load_manifest_from_fp(manifest_fp: io.TextIOBase) -> List[Dataset]: +def load_manifest_from_fp(manifest_fp: io.TextIOBase) -> list[Dataset]: logger.info("Loading manifest from file") all_datasets = parse_manifest_file(manifest_fp) datasets = [ @@ -52,7 +49,7 @@ def null_to_empty_str(val: str | None) -> str: return val -def load_manifest_from_CxG() -> List[Dataset]: +def load_manifest_from_CxG() -> list[Dataset]: logger.info("Loading manifest from CELLxGENE data portal...") # Load all collections and extract dataset_id @@ -104,8 +101,8 @@ return response -def load_blocklist(dataset_id_blocklist_uri: str | None) -> Set[str]: - blocked_dataset_ids: Set[str] = set() +def load_blocklist(dataset_id_blocklist_uri: str | None) -> set[str]: + blocked_dataset_ids: set[str] = set() if not dataset_id_blocklist_uri: msg = "No dataset blocklist specified - builder is misconfigured" logger.error(msg) @@ -124,7 +121,7 @@ return blocked_dataset_ids -def apply_blocklist(datasets: List[Dataset], dataset_id_blocklist_uri: str | None) -> List[Dataset]: +def apply_blocklist(datasets: list[Dataset], dataset_id_blocklist_uri: str | None) -> list[Dataset]: try: blocked_dataset_ids = load_blocklist(dataset_id_blocklist_uri) return list(filter(lambda d: d.dataset_id not in blocked_dataset_ids, datasets)) @@ -138,9 +135,8 @@ def load_manifest( manifest_fp: str | io.TextIOBase | None = None, dataset_id_blocklist_uri: str | None = None, -) -> List[Dataset]: - """ - Load dataset manifest from the file pointer if provided, else bootstrap +) -> list[Dataset]: + """Load dataset manifest from the file pointer if provided, else bootstrap from the CELLxGENE REST API. Apply the blocklist if provided. 
""" if manifest_fp is not None: diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/mp.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/mp.py index efba7c9ce..fd7fea344 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/mp.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/mp.py @@ -4,20 +4,15 @@ import threading import weakref from collections import deque +from collections.abc import Callable, Iterable, Iterator, Mapping from concurrent.futures import Executor, Future, ProcessPoolExecutor, ThreadPoolExecutor from functools import partial from types import TracebackType from typing import ( Any, - Callable, Generic, - Iterable, - Iterator, Literal, - Mapping, - Optional, ParamSpec, - Tuple, TypeVar, Union, ) @@ -41,8 +36,7 @@ def _mp_config_checks() -> bool: def _hard_process_cap(args: CensusBuildArgs, n_proc: int) -> int: - """ - Enforce the configured worker process limit. + """Enforce the configured worker process limit. NOTE: logic below only enforces this limit in cases using the default worker count, as there are special cases where we want higher limits, due to special knowledge that we @@ -57,15 +51,15 @@ def _default_worker_process_count(args: CensusBuildArgs) -> int: def n_workers_from_memory_budget(args: CensusBuildArgs, per_worker_budget: int) -> int: - """Trivial helper to estimate appropriate number of fixed-memory-budget workers from total memory available""" + """Trivial helper to estimate appropriate number of fixed-memory-budget workers from total memory available.""" n_workers: int = int(args.config.memory_budget // per_worker_budget) return min(n_workers, _default_worker_process_count(args)) def create_process_pool_executor( args: CensusBuildArgs, - max_workers: Optional[int] = None, - max_tasks_per_child: Optional[int] = None, + max_workers: int | None = None, + max_tasks_per_child: int | None = None, ) -> ProcessPoolExecutor: assert _mp_config_checks() if max_workers is None: @@ -77,15 +71,14 @@ def create_process_pool_executor( ) -def create_thread_pool_executor(max_workers: Optional[int] = None) -> ThreadPoolExecutor: +def create_thread_pool_executor(max_workers: int | None = None) -> ThreadPoolExecutor: assert _mp_config_checks() logger.debug(f"create_thread_pool_executor [max_workers={max_workers}]") return ThreadPoolExecutor(max_workers=max_workers) def log_on_broken_process_pool(ppe: Union[ProcessPoolExecutor, "ResourcePoolProcessExecutor"]) -> None: - """ - There are a number of conditions where the Process Pool can be broken, + """There are a number of conditions where the Process Pool can be broken, such that it will hang in a shutdown. This will cause the context __exit__ to hang indefinitely, as it calls ProcessPoolExecutor.shutdown with `wait=True`. @@ -99,7 +92,6 @@ def log_on_broken_process_pool(ppe: Union[ProcessPoolExecutor, "ResourcePoolProc Caution: uses ProcessPoolExecutor internal API, as this state is not otherwise visible. 
""" - if ppe._broken: logger.critical(f"Process pool broken and may fail or hang: {ppe._broken}") @@ -118,13 +110,13 @@ class EagerIterator(Iterator[_T]): def __init__( self, iterator: Iterator[_T], - pool: Optional[Executor] = None, + pool: Executor | None = None, ): super().__init__() self.iterator = iterator self._pool = pool or ThreadPoolExecutor() self._own_pool = pool is None - self._future: Optional[Future[_T]] = None + self._future: Future[_T] | None = None self._fetch_next() def _fetch_next(self) -> None: @@ -164,7 +156,7 @@ class _WorkItem(Generic[_T]): kwargs: Mapping[str, Any] -_MightBeWork = Tuple[bool, Optional[_WorkItem[Any]]] +_MightBeWork = tuple[bool, _WorkItem[Any] | None] _SchedulerMethod = Literal["best-fit", "first-fit"] @@ -202,8 +194,7 @@ def submit(self, wi: _WorkItem[_T]) -> Future[_T]: return f def _get_work(self) -> _MightBeWork: - """ - Get next work item to schedule. + """Get next work item to schedule. IMPORTANT: caller MUST own scheduler _condition lock to call this. @@ -216,7 +207,6 @@ def _get_work(self) -> _MightBeWork: def _get_next_work() -> int | None: """Return index of "best" work item to scheudle, or None if work is unavailable.""" - # Best fit: return the largest resource consumer that fits in available space max_available_resources = self.max_resources - self.resources_in_use candidate_work = filter(lambda v: v[1].resources <= max_available_resources, enumerate(self._pending_work)) @@ -271,7 +261,7 @@ def _work_item_done( scheduler._release_resources(wi) def _schedule_work(self, work: _WorkItem[Any]) -> None: - """must hold lock""" + """Must hold lock.""" executor = self.executor_ref() if executor is None: # can happen if the ResourcePoolExecutor was collected @@ -301,8 +291,7 @@ def _debug_msg(self, msg: str) -> None: class ResourcePoolProcessExecutor(contextlib.AbstractContextManager["ResourcePoolProcessExecutor"]): - """ - Provides a ProcessPoolExecutor-like API, scheduling based upon static "resource" reservation + """Provides a ProcessPoolExecutor-like API, scheduling based upon static "resource" reservation requests. A "resource" is any shared capacity or resource, expressed as an integer value. Class holds a queue of "work items", scheduling them into an actual ProcessPoolExecutor when sufficient resources are available. 
@@ -348,7 +337,7 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False) -> None: self.process_pool.close() def __exit__( - self, exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType] + self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None ) -> None: self.shutdown(wait=True) return None @@ -356,9 +345,9 @@ def __exit__( def create_resource_pool_executor( args: CensusBuildArgs, - max_resources: Optional[int] = None, - max_workers: Optional[int] = None, - max_tasks_per_child: Optional[int] = None, + max_resources: int | None = None, + max_workers: int | None = None, + max_tasks_per_child: int | None = None, ) -> ResourcePoolProcessExecutor: assert _mp_config_checks() diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/schema_util.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/schema_util.py index 93dc106d8..5d210dc6c 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/schema_util.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/schema_util.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import List, Optional, Sequence, Tuple, TypeVar, Union, cast +from collections.abc import Sequence +from typing import TypeVar, cast import attrs import numpy.typing as npt @@ -17,7 +18,7 @@ """ -OptDataFrame = TypeVar("OptDataFrame", pd.DataFrame, None, Optional[pd.DataFrame]) +OptDataFrame = TypeVar("OptDataFrame", pd.DataFrame, None, pd.DataFrame | None) @attrs.define(frozen=True, kw_only=True, slots=True) @@ -31,8 +32,7 @@ class FieldSpec: is_dictionary: bool = False # dictionary or primitive field type def to_pandas_dtype(self, *, ignore_dict_type: bool = False) -> npt.DTypeLike: - """ - Return the Pandas dtype for this field. + """Return the Pandas dtype for this field. This is only possible if the field is not a dictionary-type field. @@ -44,8 +44,7 @@ def to_pandas_dtype(self, *, ignore_dict_type: bool = False) -> npt.DTypeLike: return cast(npt.DTypeLike, self.type.to_pandas_dtype()) def is_type_equivalent(self, other_type: pa.DataType, *, null_non_primitive_equivalence: bool = False) -> bool: - """ - Return True if this FieldSpec is equivalent to the Arrow `other_type`. + """Return True if this FieldSpec is equivalent to the Arrow `other_type`. For convenience in comparing with types inferred from Pandas DataFrames, where strings and other Arrow non-primitives are stored as objects, allow a pa.null DataType to be equivalent to Arrow non-primitive. @@ -91,19 +90,18 @@ def _check_type_compat(self, other_type: pa.DataType, empty_dataframe: bool) -> @attrs.define(frozen=True, kw_only=True, slots=True) class TableSpec: - """ - List of FieldSpec defining an Arrow Table, with a table-wide feature flag enabling/disabling + """List of FieldSpec defining an Arrow Table, with a table-wide feature flag enabling/disabling use of Dictionary types. Instantiate ONLY with the class method `create`. 
""" - fields: List[FieldSpec] + fields: list[FieldSpec] use_arrow_dictionaries: bool # Feature flag to enable/disable dictionary/enum support @classmethod def create( - cls, fields: Sequence[Union[FieldSpec, Tuple[str, pa.DataType]]], *, use_arrow_dictionary: bool = False + cls, fields: Sequence[FieldSpec | tuple[str, pa.DataType]], *, use_arrow_dictionary: bool = False ) -> TableSpec: u = [] for f in fields: @@ -116,14 +114,13 @@ def create( u.append(FieldSpec(name=name, type=type, is_dictionary=False)) # quick unique check - if len(set(f.name for f in u)) != len(fields): + if len(set(f.name for f in u)) != len(fields): # noqa: C401 raise ValueError("All field names must be unique.") return TableSpec(fields=u, use_arrow_dictionaries=use_arrow_dictionary) - def to_arrow_schema(self, df: Optional[pd.DataFrame] = None) -> pa.Schema: - """ - Returns Arrow schema for a Table. + def to_arrow_schema(self, df: pd.DataFrame | None = None) -> pa.Schema: + """Returns Arrow schema for a Table. Use the specified types, but check for equivalence. Where the field spec is a dictionary, create the narrowest possible dictionary index_type sufficient for @@ -151,8 +148,8 @@ def to_arrow_schema(self, df: Optional[pd.DataFrame] = None) -> pa.Schema: return pa.schema(pa_fields) def field_names(self) -> Sequence[str]: - """Return field names for this TableSpec as a sequence of string""" - return list(field.name for field in self.fields) + """Return field names for this TableSpec as a sequence of string.""" + return [field.name for field in self.fields] def field(self, key: str) -> FieldSpec: """Return the named field, or raise ValueError if no such key.""" @@ -163,7 +160,7 @@ def field(self, key: str) -> FieldSpec: return r[0] def recategoricalize(self, df: OptDataFrame) -> OptDataFrame: - """Apply/unapply categorical typing to match table schema spec""" + """Apply/unapply categorical typing to match table schema spec.""" if df is None or df.empty: return df diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/source_assets.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/source_assets.py index 9f2819b2a..d4bc47dbf 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/source_assets.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/source_assets.py @@ -2,7 +2,7 @@ import os import time import urllib.parse -from typing import List, Tuple, cast +from typing import cast import aiohttp import fsspec @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -def stage_source_assets(datasets: List[Dataset], args: CensusBuildArgs) -> None: +def stage_source_assets(datasets: list[Dataset], args: CensusBuildArgs) -> None: assets_dir = args.h5ads_path.as_posix() # e.g., "census-builder-prod/1.0.0" @@ -85,7 +85,7 @@ def _copy_file(n: int, dataset: Dataset, asset_dir: str, N: int, user_agent: str return dataset_file_name -def copy_file(args: Tuple[int, Dataset, str, int, str]) -> str: +def copy_file(args: tuple[int, Dataset, str, int, str]) -> str: return _copy_file(*args) diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/stats.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/stats.py index 6457832eb..4b8c7e913 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/stats.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/stats.py @@ -1,5 +1,3 @@ -from typing import Union - import numba import 
numpy as np import numpy.typing as npt @@ -10,10 +8,9 @@ def get_obs_stats( - raw_X: Union[sparse.csr_matrix, sparse.csc_matrix], + raw_X: sparse.csr_matrix | sparse.csc_matrix, ) -> pd.DataFrame: """Compute summary stats for obs axis, and return as a dataframe.""" - if not isinstance(raw_X, sparse.csr_matrix) and not isinstance(raw_X, sparse.csc_matrix): raise NotImplementedError(f"get_obs_stats: unsupported type {type(raw_X)}") @@ -38,7 +35,7 @@ def get_obs_stats( def get_var_stats( - raw_X: Union[sparse.csr_matrix, sparse.csc_matrix, npt.NDArray[np.float32]], + raw_X: sparse.csr_matrix | sparse.csc_matrix | npt.NDArray[np.float32], ) -> pd.DataFrame: if isinstance(raw_X, sparse.csr_matrix) or isinstance(raw_X, sparse.csc_matrix): nnz = raw_X.getnnz(axis=0) @@ -62,9 +59,9 @@ def get_var_stats( nopython=True, ) # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 def _var_ndarray(data: npt.NDArray[np.float32], ddof: int) -> float: - """ - Return variance of an ndarray. Computed as variance of shifted distribution, - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + """Return variance of an ndarray. + + Computed as variance of shifted distribution, https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. """ n = len(data) if n < 2: @@ -111,7 +108,7 @@ def _var_matrix( def _var( - matrix: Union[sparse.csr_matrix, sparse.csc_matrix], + matrix: sparse.csr_matrix | sparse.csc_matrix, axis: int = 0, ddof: int = 1, ) -> npt.NDArray[np.float64]: diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/summary_cell_counts.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/summary_cell_counts.py index a1a4fd748..7e4c00010 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/summary_cell_counts.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/summary_cell_counts.py @@ -1,5 +1,5 @@ import logging -from typing import Sequence +from collections.abc import Sequence import numpy as np import pandas as pd @@ -14,9 +14,7 @@ def create_census_summary_cell_counts( info_collection: soma.Collection, per_experiment_summary: Sequence[pd.DataFrame] ) -> None: - """ - Save per-category counts as the census_summary_cell_counts SOMA dataframe - """ + """Save per-category counts as the census_summary_cell_counts SOMA dataframe.""" logger.info("Creating census_summary_cell_counts") df = ( pd.concat(per_experiment_summary, ignore_index=True) @@ -49,9 +47,7 @@ def init_summary_counts_accumulator() -> pd.DataFrame: def accumulate_summary_counts(current: pd.DataFrame, obs_df: pd.DataFrame) -> pd.DataFrame: - """ - Add summary counts to the census_summary_cell_counts dataframe - """ + """Add summary counts to the census_summary_cell_counts dataframe.""" assert "dataset_id" in obs_df if len(obs_df) == 0: diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/util.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/util.py index bfdd85b14..005ded1e5 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/util.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/util.py @@ -1,6 +1,7 @@ import os import time -from typing import Any, Iterator, Optional, Union +from collections.abc import Iterator +from typing import Any import numpy as np import numpy.typing as npt @@ -10,11 +11,10 @@ def array_chunker( - arr: Union[npt.NDArray[Any], sparse.spmatrix], - 
nnz_chunk_size: Optional[int] = 256 * 1024**2, # goal (~2.4GiB for a 32-bit COO) + arr: npt.NDArray[Any] | sparse.spmatrix, + nnz_chunk_size: int | None = 256 * 1024**2, # goal (~2.4GiB for a 32-bit COO) ) -> Iterator[sparse.coo_matrix]: - """ - Return the array as multiple chunks, each a coo_matrix. + """Return the array as multiple chunks, each a coo_matrix. The slicing is always done by row (for ndarray and csr_matrix) or by column (for csc_matrix), and will never split a row (or column) into two separate slices. @@ -30,7 +30,6 @@ def array_chunker( Raises: NotImplementedError: If the matrix type is not supported. """ - if isinstance(arr, sparse.csr_matrix) or isinstance(arr, sparse.csr_array): avg_nnz_per_row = arr.nnz // arr.shape[0] row_chunk_size = max(1, round(nnz_chunk_size / avg_nnz_per_row)) @@ -76,9 +75,8 @@ def fetch_json(url: str, delay_secs: float = 0.0) -> object: return response.json() -def is_nonnegative_integral(X: Union[npt.NDArray[np.floating[Any]], sparse.spmatrix]) -> bool: - """ - Return true if the matrix/array contains only positive integral values, +def is_nonnegative_integral(X: npt.NDArray[np.floating[Any]] | sparse.spmatrix) -> bool: + """Return true if the matrix/array contains only positive integral values, False otherwise. """ data = X if isinstance(X, np.ndarray) else X.data @@ -92,9 +90,7 @@ def is_nonnegative_integral(X: Union[npt.NDArray[np.floating[Any]], sparse.spmat def get_git_commit_sha() -> str: - """ - Returns the git commit SHA for the current repo - """ + """Returns the git commit SHA for the current repo.""" # Try to get the git commit SHA from the COMMIT_SHA env variable commit_sha_var = os.getenv("COMMIT_SHA") if commit_sha_var is not None: @@ -110,9 +106,7 @@ def get_git_commit_sha() -> str: def is_git_repo_dirty() -> bool: - """ - Returns True if the git repo is dirty, i.e. there are uncommitted changes - """ + """Returns True if the git repo is dirty, i.e. 
there are uncommitted changes.""" import git # Scoped import - this requires the git executable to exist on the machine # work around https://github.com/gitpython-developers/GitPython/issues/1349 diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py index f7b7afb18..c10efb2ba 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py @@ -7,7 +7,7 @@ import pathlib from dataclasses import dataclass from datetime import datetime -from typing import Any, Dict, List, Tuple, TypeVar, Union +from typing import Any, Self, TypeVar import numpy as np import numpy.typing as npt @@ -16,7 +16,6 @@ import tiledb import tiledbsoma as soma from scipy import sparse -from typing_extensions import Self from ..build_state import CensusBuildArgs from ..util import log_process_resource_status, urlcat @@ -56,7 +55,7 @@ @dataclass # TODO: use attrs class EbInfo: - """Class used to collect information about axis (for validation code)""" + """Class used to collect information about axis (for validation code).""" n_obs: int = 0 vars: set[str] = dataclasses.field(default_factory=set) @@ -78,9 +77,8 @@ def open_experiment(base_uri: str, eb: ExperimentSpecification) -> soma.Experime return soma.Experiment.open(urlcat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r") -def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: List[ExperimentSpecification]) -> bool: - """ - Validate all objects present and contain expected metadata. +def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: list[ExperimentSpecification]) -> bool: + """Validate all objects present and contain expected metadata. 
soma_path +-- census_info: soma.Collection @@ -91,7 +89,6 @@ def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: L | +-- homo_sapiens: soma.Experiment | +-- mus_musculus: soma.Experiment """ - with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: assert soma.Collection.exists(census.uri) assert datetime.fromisoformat(census.metadata["created_on"]) @@ -161,13 +158,13 @@ def validate_all_soma_objects_exist(soma_path: str, experiment_specifications: L return True -def _validate_axis_dataframes(args: Tuple[str, str, Dataset, List[ExperimentSpecification]]) -> Dict[str, EbInfo]: +def _validate_axis_dataframes(args: tuple[str, str, Dataset, list[ExperimentSpecification]]) -> dict[str, EbInfo]: assets_path, soma_path, dataset, experiment_specifications = args with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: census_data = census[CENSUS_DATA_NAME] dataset_id = dataset.dataset_id unfiltered_ad = open_anndata(assets_path, dataset) - eb_info: Dict[str, EbInfo] = {} + eb_info: dict[str, EbInfo] = {} for eb in experiment_specifications: eb_info[eb.name] = EbInfo() anndata_cell_filter = make_anndata_cell_filter(eb.anndata_cell_filter_spec) @@ -217,14 +214,13 @@ def _validate_axis_dataframes(args: Tuple[str, str, Dataset, List[ExperimentSpec def validate_axis_dataframes( assets_path: str, soma_path: str, - datasets: List[Dataset], - experiment_specifications: List[ExperimentSpecification], + datasets: list[Dataset], + experiment_specifications: list[ExperimentSpecification], args: CensusBuildArgs, -) -> Dict[str, EbInfo]: - """ " - Validate axis dataframes: schema, shape, contents +) -> dict[str, EbInfo]: + """Validate axis dataframes: schema, shape, contents. - Raises on error. Returns True on success. + Raises on error. Returns True on success. """ logger.debug("validate_axis_dataframes") with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: @@ -303,15 +299,14 @@ def validate_axis_dataframes( def _validate_X_obs_axis_stats( eb: ExperimentSpecification, dataset: Dataset, census_obs: pd.DataFrame, expected_X: sparse.spmatrix ) -> bool: - """ - Helper function for _validate_X_layers_contents_by_dataset + """Helper function for _validate_X_layers_contents_by_dataset. Checks that the computed X stats, as stored in obs and var, are correct. """ TypeVar("T", bound=npt.NBitBase) - def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int = 1) -> Any: # cough, cough - """Helper: variance over sparse matrices""" + def var(X: sparse.csc_matrix | sparse.csr_matrix, axis: int = 0, ddof: int = 1) -> Any: # cough, cough + """Helper: variance over sparse matrices.""" if isinstance(X, np.ndarray): return np.var(X, axis=axis, ddof=ddof) @@ -328,7 +323,7 @@ def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int return v # various datasets have explicit zeros, which are not stored in the Census - if isinstance(expected_X, (sparse.sparray, sparse.spmatrix)): + if isinstance(expected_X, (sparse.sparray, sparse.spmatrix)): # noqa: UP038 expected_X.eliminate_zeros() # obs.raw_sum @@ -360,9 +355,8 @@ def var(X: Union[sparse.csc_matrix, sparse.csr_matrix], axis: int = 0, ddof: int return True -def _validate_Xraw_contents_by_dataset(args: Tuple[str, str, Dataset, List[ExperimentSpecification]]) -> bool: - """ - Validate that a single dataset is correctly represented in the census. 
Intended to be +def _validate_Xraw_contents_by_dataset(args: tuple[str, str, Dataset, list[ExperimentSpecification]]) -> bool: + """Validate that a single dataset is correctly represented in the census. Intended to be dispatched from validate_X_layers. Currently, implements the following tests: @@ -508,8 +502,8 @@ def _validate_Xraw_contents_by_dataset(args: Tuple[str, str, Dataset, List[Exper return True -def _validate_X_layer_has_unique_coords(args: Tuple[ExperimentSpecification, str, str, int, int]) -> bool: - """Validate that all X layers have no duplicate coordinates""" +def _validate_X_layer_has_unique_coords(args: tuple[ExperimentSpecification, str, str, int, int]) -> bool: + """Validate that all X layers have no duplicate coordinates.""" experiment_specification, soma_path, layer_name, row_range_start, row_range_stop = args with open_experiment(soma_path, experiment_specification) as exp: logger.info( @@ -542,8 +536,8 @@ def _validate_X_layer_has_unique_coords(args: Tuple[ExperimentSpecification, str return True -def _validate_Xnorm_layer(args: Tuple[ExperimentSpecification, str, int, int]) -> bool: - """Validate that X['normalized'] is correct relative to X['raw']""" +def _validate_Xnorm_layer(args: tuple[ExperimentSpecification, str, int, int]) -> bool: + """Validate that X['normalized'] is correct relative to X['raw'].""" experiment_specification, soma_path, row_range_start, row_range_stop = args logger.info( f"validate_Xnorm_layer - start, {experiment_specification.name}, rows [{row_range_start}, {row_range_stop})" @@ -639,15 +633,14 @@ def _validate_Xnorm_layer(args: Tuple[ExperimentSpecification, str, int, int]) - def validate_X_layers( assets_path: str, soma_path: str, - datasets: List[Dataset], - experiment_specifications: List[ExperimentSpecification], - eb_info: Dict[str, EbInfo], + datasets: list[Dataset], + experiment_specifications: list[ExperimentSpecification], + eb_info: dict[str, EbInfo], args: CensusBuildArgs, ) -> bool: - """ " - Validate all X layers: schema, shape, contents + """Validate all X layers: schema, shape, contents. - Raises on error. Returns True on success. + Raises on error. Returns True on success. """ logger.info("validate_X_layers start") avg_row_nnz = 0 @@ -736,7 +729,7 @@ def validate_X_layers( return True -def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset]: +def load_datasets_from_census(assets_path: str, soma_path: str) -> list[Dataset]: # Datasets are pulled from the census datasets manifest, validating the SOMA # census against the snapshot assets. 
with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census: @@ -749,7 +742,7 @@ def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset] return datasets -def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> bool: +def validate_manifest_contents(assets_path: str, datasets: list[Dataset]) -> bool: """Confirm contents of manifest are correct.""" for d in datasets: p = pathlib.Path(urlcat(assets_path, d.dataset_h5ad_path)) @@ -760,7 +753,7 @@ def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> boo def validate_consolidation(soma_path: str) -> bool: - """Verify that obs, var and X layers are all fully consolidated & vacuumed""" + """Verify that obs, var and X layers are all fully consolidated & vacuumed.""" def is_empty_tiledb_array(uri: str) -> bool: with tiledb.open(uri) as A: @@ -779,7 +772,7 @@ def is_empty_tiledb_array(uri: str) -> bool: def validate_directory_structure(soma_path: str, assets_path: str) -> bool: - """Verify that the entire census is a single directory tree""" + """Verify that the entire census is a single directory tree.""" assert soma_path.startswith(assets_path.rsplit("/", maxsplit=1)[0]) assert os.path.exists(soma_path), f"Unable to find SOMA path, expecting {soma_path}" assert os.path.exists(assets_path), f"Unable to find assets path, expecting {assets_path}" @@ -788,8 +781,8 @@ def validate_directory_structure(soma_path: str, assets_path: str) -> bool: def validate_relative_path(soma_path: str) -> bool: - """ - Verify the census objects are stored in the same relative path + """Verify the census objects are stored in the same relative path. + :param soma_path: :return: """ @@ -810,11 +803,9 @@ def _walk_tree(name: str, parent: Any) -> None: def validate_internal_consistency( - soma_path: str, experiment_specifications: List[ExperimentSpecification], datasets: List[Dataset] + soma_path: str, experiment_specifications: list[ExperimentSpecification], datasets: list[Dataset] ) -> bool: - """ - Internal checks that various computed stats match. - """ + """Internal checks that various computed stats match.""" logger.info("validate_internal_consistency - cross-checks start") datasets_df: pd.DataFrame = Dataset.to_dataframe(datasets).set_index("soma_joinid") @@ -885,17 +876,16 @@ def validate_internal_consistency( def validate_soma_bounding_box( - soma_path: str, experiment_specifications: List[ExperimentSpecification], eb_info: Dict[str, EbInfo] + soma_path: str, experiment_specifications: list[ExperimentSpecification], eb_info: dict[str, EbInfo] ) -> bool: - """ - Verify that single-cell-data/TileDB-SOMA#1969 is not affecting our results. + """Verify that single-cell-data/TileDB-SOMA#1969 is not affecting our results. Verification is: * shape is set correctly * no sparse arrays contain the bounding box in metadata """ - def get_sparse_arrays(C: soma.Collection) -> List[soma.SparseNDArray]: + def get_sparse_arrays(C: soma.Collection) -> list[soma.SparseNDArray]: uris = [] for soma_obj in C.values(): type = soma_obj.soma_type @@ -935,8 +925,7 @@ def get_sparse_arrays(C: soma.Collection) -> List[soma.SparseNDArray]: def validate(args: CensusBuildArgs) -> bool: - """ - Validate that the "census" matches the datasets and experiment builder spec. + """Validate that the "census" matches the datasets and experiment builder spec. Will raise if validation fails. Returns True on success. 
""" diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_state.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_state.py index a475cdda0..9b8187b06 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_state.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_state.py @@ -1,19 +1,16 @@ -""" -Manage the configuration and dynamic build state for the Census build. - -""" +"""Manage the configuration and dynamic build state for the Census build.""" import functools import io import os import pathlib +from collections.abc import Iterator, Mapping from datetime import datetime -from typing import Any, Iterator, Mapping, Union +from typing import Any, Self import psutil import yaml from attrs import define, field, fields, validators -from typing_extensions import Self """ Defaults for Census configuration. @@ -89,8 +86,8 @@ class CensusBuildConfig: test_first_n: int = field(converter=int, default=0) @classmethod - def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: - if isinstance(file, (str, os.PathLike)): + def load(cls, file: str | os.PathLike[str] | io.TextIOBase) -> Self: + if isinstance(file, str | os.PathLike): with open(file) as f: user_config = yaml.safe_load(f) else: @@ -117,7 +114,7 @@ def load_from_env_vars(cls) -> Self: class Namespace(Mapping[str, Any]): - """Readonly namespace""" + """Readonly namespace.""" def __init__(self, **kwargs: Any): self._state = dict(kwargs) @@ -154,7 +151,7 @@ def __setstate__(self, state: dict[str, Any]) -> None: class MutableNamespace(Namespace): - """Mutable namespace""" + """Mutable namespace.""" def __setitem__(self, key: str, value: Any) -> None: if not isinstance(key, str): @@ -177,8 +174,8 @@ def __setitem__(self, key: str, value: Any) -> None: self.__dirty_keys.add(key) @classmethod - def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: - if isinstance(file, (str, os.PathLike)): + def load(cls, file: str | os.PathLike[str] | io.TextIOBase) -> Self: + if isinstance(file, str | os.PathLike): with open(file) as state_log: documents = list(yaml.safe_load_all(state_log)) else: @@ -188,7 +185,7 @@ def load(cls, file: Union[str, os.PathLike[str], io.TextIOBase]) -> Self: state.__dirty_keys.clear() return state - def commit(self, file: Union[str, os.PathLike[str]]) -> None: + def commit(self, file: str | os.PathLike[str]) -> None: # append dirty elements (atomic on Posix) if self.__dirty_keys: dirty = {k: self[k] for k in self.__dirty_keys} @@ -203,7 +200,8 @@ class CensusBuildArgs: working_dir: pathlib.PosixPath = field(validator=validators.instance_of(pathlib.PosixPath)) config: CensusBuildConfig = field(validator=validators.instance_of(CensusBuildConfig)) state: CensusBuildState = field( - factory=CensusBuildState, validator=validators.instance_of(CensusBuildState) # default: empty state + factory=CensusBuildState, + validator=validators.instance_of(CensusBuildState), # default: empty state ) @property diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/census_summary.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/census_summary.py index 0783a1587..605b66fe9 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/census_summary.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/census_summary.py @@ -1,6 +1,6 @@ import argparse import sys -from typing import Optional, TextIO +from typing import TextIO import cellxgene_census import pandas as 
pd @@ -16,9 +16,9 @@ def display_summary( *, - census_version: Optional[str] = "latest", - uri: Optional[str] = None, - file: Optional[TextIO] = None, + census_version: str | None = "latest", + uri: str | None = None, + file: TextIO | None = None, ) -> int: census = cellxgene_census.open_soma(census_version=census_version, uri=uri) @@ -50,11 +50,11 @@ def display_summary( def display_diff( - census_version: Optional[str] = "latest", - uri: Optional[str] = None, - previous_census_version: Optional[str] = None, - previous_uri: Optional[str] = None, - file: Optional[TextIO] = None, + census_version: str | None = "latest", + uri: str | None = None, + previous_census_version: str | None = None, + previous_uri: str | None = None, + file: TextIO | None = None, ) -> int: census = cellxgene_census.open_soma(census_version=census_version, uri=uri) previous_census = cellxgene_census.open_soma(census_version=previous_census_version, uri=previous_uri) diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/data_copy.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/data_copy.py index f75c5cab3..896abbfb9 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/data_copy.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/data_copy.py @@ -3,14 +3,13 @@ import pathlib import subprocess import sys -from typing import Union from .logging import logging_init_params logger = logging.getLogger(__name__) -def sync_to_S3(from_path: Union[str, pathlib.PosixPath], to_path: str, dryrun: bool = False) -> None: +def sync_to_S3(from_path: str | pathlib.PosixPath, to_path: str, dryrun: bool = False) -> None: """Copy (sync) local directory to S3. Equivalent of `aws s3 sync local_directory_path S3_path`. diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/host_validation.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/host_validation.py index 8f817be91..143d1ee95 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/host_validation.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/host_validation.py @@ -1,7 +1,6 @@ import logging import os import sys -from typing import Union import psutil @@ -12,24 +11,21 @@ def _check(condition: bool, message: str) -> bool: - """Like assert, but logs""" + """Like assert, but logs.""" if not condition: logger.critical(message) return condition def check_os() -> bool: - """ - Check that we run on Posix (Linux, MacOS), as we rely on + """Check that we run on Posix (Linux, MacOS), as we rely on Posix semantics for a few things. """ return _check(os.name == "posix" and psutil.POSIX, "Census builder requires Posix OS") def check_physical_memory(min_physical_memory: int) -> bool: - """ - Check for sufficient physical and virtual memory. - """ + """Check for sufficient physical and virtual memory.""" svmem = psutil.virtual_memory() logger.debug(f"Host: {hr_binary_unit(svmem.total)} memory found") return _check( @@ -39,9 +35,7 @@ def check_physical_memory(min_physical_memory: int) -> bool: def check_swap_memory(min_swap_memory: int) -> bool: - """ - Check for sufficient physical and virtual memory. 
- """ + """Check for sufficient physical and virtual memory.""" svswap = psutil.swap_memory() logger.debug(f"Host: {hr_binary_unit(svswap.total)} swap found") return _check( @@ -51,10 +45,8 @@ def check_swap_memory(min_swap_memory: int) -> bool: ) -def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_space: int) -> bool: - """ - Check for sufficient free disk space. - """ +def check_free_disk(working_dir: str | os.PathLike[str], min_free_disk_space: int) -> bool: + """Check for sufficient free disk space.""" working_dir_fspath = working_dir.__fspath__() if isinstance(working_dir, os.PathLike) else working_dir skdiskusage = psutil.disk_usage(working_dir_fspath) logger.debug(f"Host: {hr_decimal_unit(skdiskusage.free)} free disk space found") @@ -66,7 +58,7 @@ def check_free_disk(working_dir: Union[str, os.PathLike[str]], min_free_disk_spa def check_host(args: CensusBuildArgs) -> bool: - """Verify all host requirments. Return True if OK, False if conditions not met""" + """Verify all host requirments. Return True if OK, False if conditions not met.""" if args.config.host_validation_disable: return True diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/logging.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/logging.py index f3dc4b459..120e87b61 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/logging.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/logging.py @@ -2,15 +2,12 @@ import math import pathlib import sys -from typing import List, Optional, Tuple from .build_state import CensusBuildArgs -def logging_init_params(verbose: int, handlers: Optional[List[logging.Handler]] = None) -> None: - """ - Configure the logger defaults with explicit config params - """ +def logging_init_params(verbose: int, handlers: list[logging.Handler] | None = None) -> None: + """Configure the logger defaults with explicit config params.""" def clamp(n: int, minn: int, maxn: int) -> int: return min(max(n, minn), maxn) @@ -39,10 +36,8 @@ def get_level(v: int) -> int: def logging_init(args: CensusBuildArgs) -> None: - """ - Configure the logger from CensusBuildArgs, including extra handlers. - """ - handlers: List[logging.Handler] = [logging.StreamHandler(sys.stderr)] + """Configure the logger from CensusBuildArgs, including extra handlers.""" + handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)] # Create logging directory if configured appropriately if args.config.log_dir and args.config.log_file: @@ -54,7 +49,7 @@ def logging_init(args: CensusBuildArgs) -> None: logging_init_params(args.config.verbose, handlers) -def _hr_multibyte_unit(n_bytes: int, unit_base: int, unit_size_names: Tuple[str, ...]) -> str: +def _hr_multibyte_unit(n_bytes: int, unit_base: int, unit_size_names: tuple[str, ...]) -> str: """Private. 
Convert number of bytes into a human-readable multi-byte unit string.""" if n_bytes == 0: return "0B" diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_cleanup.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_cleanup.py index e3e962395..7bebc7e44 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_cleanup.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_cleanup.py @@ -2,7 +2,6 @@ import logging import sys from datetime import datetime, timedelta -from typing import List import s3fs @@ -20,8 +19,7 @@ def remove_releases_older_than(days: int, census_base_url: str, dryrun: bool, s3_anon: bool = False) -> None: - """ - Remove old releases, committing the change to release.json. + """Remove old releases, committing the change to release.json. Current rules - delete releases where: * Tag is a date older than `days` in age @@ -34,7 +32,6 @@ def remove_releases_older_than(days: int, census_base_url: str, dryrun: bool, s3 Age of release is determined by the release version name, i.e., YYYY-MM-DD. The S3 object date or other information is not utilized. """ - _log_it(f"Delete releases older than {days} days old.", dryrun) # Load the release manifest @@ -78,7 +75,7 @@ def _update_release_manifest( def _perform_recursive_delete(rls_tag: CensusVersionName, uri: str, dryrun: bool) -> None: - """Will raise FileNotFound error if the path does not exist (which should never happen)""" + """Will raise FileNotFound error if the path does not exist (which should never happen).""" _log_it(f"Delete census release {rls_tag}: {uri}", dryrun) if dryrun: return @@ -86,7 +83,7 @@ def _perform_recursive_delete(rls_tag: CensusVersionName, uri: str, dryrun: bool s3.rm(uri, recursive=True) -def _find_removal_candidates(release_manifest: CensusReleaseManifest, days_older_than: int) -> List[CensusVersionName]: +def _find_removal_candidates(release_manifest: CensusReleaseManifest, days_older_than: int) -> list[CensusVersionName]: delete_before_date = datetime.now() - timedelta(days=days_older_than) # all releases which have a tag aliasing them @@ -94,7 +91,7 @@ def _find_removal_candidates(release_manifest: CensusReleaseManifest, days_older # In practice, we REQUIRE at least a `latest` tag, so this list should never be empty assert len(is_aliased) > 0 - candidates: List[CensusVersionName] = [] + candidates: list[CensusVersionName] = [] for rls_tag, rls_info in release_manifest.items(): if isinstance(rls_info, dict) and (rls_tag not in is_aliased) and not rls_info.get("do_not_delete", False): # candidate for deletion - check timestamp diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_manifest.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_manifest.py index 5a4a84201..f336a4567 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_manifest.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/release_manifest.py @@ -1,12 +1,10 @@ -""" -Tools to manage the release.json manifest file. -""" +"""Tools to manage the release.json manifest file.""" import json -from typing import Dict, Optional, Union, cast +from typing import NotRequired, cast import s3fs -from typing_extensions import NotRequired, TypedDict +from typing_extensions import TypedDict from .util import urlcat @@ -15,27 +13,25 @@ """ CensusVersionName = str # census version name, e.g., "release-99", "2022-10-01-test", etc. 
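# The hunk below rewrites the functional TypedDict spelling as class syntax.
# A minimal sketch of the equivalence (names illustrative, not from this patch):
#
#     from typing import NotRequired, TypedDict
#
#     Functional = TypedDict("Functional", {"uri": str, "s3_region": str | None})
#
#     class ClassForm(TypedDict):
#         uri: str
#         s3_region: str | None                 # still required; None is merely an allowed value
#         flags: NotRequired[dict[str, bool]]   # this key may be omitted entirely
#
# Note that `T | None` keeps a key required; only NotRequired makes it optional.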
-CensusLocator = TypedDict( - "CensusLocator", - { - "uri": str, # resource URI - "relative_uri": str, # relative URI - "s3_region": Optional[str], # if an S3 URI, has optional region - }, -) -CensusVersionDescription = TypedDict( - "CensusVersionDescription", - { - "release_date": Optional[str], # date of release (deprecated) - "release_build": str, # date of build - "soma": CensusLocator, # SOMA objects locator - "h5ads": CensusLocator, # source H5ADs locator - "do_not_delete": Optional[bool], # if set, prevents automated deletion - "flags": NotRequired[Dict[str, bool]], # flags for the release - "retraction": NotRequired[Dict[str, str]], # if retracted, details of the retraction - }, -) -CensusReleaseManifest = Dict[CensusVersionName, Union[CensusVersionName, CensusVersionDescription]] + + +class CensusLocator(TypedDict): + uri: str + relative_uri: str + s3_region: str | None + + +class CensusVersionDescription(TypedDict): + release_date: str | None + release_build: str + soma: CensusLocator + h5ads: CensusLocator + do_not_delete: bool | None + flags: NotRequired[dict[str, bool]] + retraction: NotRequired[dict[str, str]] + + +CensusReleaseManifest = dict[CensusVersionName, CensusVersionName | CensusVersionDescription] CENSUS_AWS_REGION = "us-west-2" CENSUS_RELEASE_FILE = "release.json" @@ -47,8 +43,7 @@ def get_release_manifest(census_base_url: str, s3_anon: bool = False) -> CensusReleaseManifest: - """ - Fetch the census release manifest. + """Fetch the census release manifest. Args: census_base_url: @@ -65,9 +60,7 @@ def get_release_manifest(census_base_url: str, s3_anon: bool = False) -> CensusR def commit_release_manifest( census_base_url: str, release_manifest: CensusReleaseManifest, dryrun: bool = False ) -> None: - """ - Write a new release manifest to the Census. - """ + """Write a new release manifest to the Census.""" # Out of an abundance of caution, validate the contents validate_release_manifest(census_base_url, release_manifest) if not dryrun: @@ -166,10 +159,7 @@ def make_a_release( make_latest: bool, dryrun: bool = False, ) -> None: - """ - Make a release and optionally alias release as `latest` - """ - + """Make a release and optionally alias release as `latest`.""" manifest = get_release_manifest(census_base_url) if rls_tag in manifest: raise ValueError(f"Release version {rls_tag} is already in the release manifest") diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/util.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/util.py index 02360e1a3..462a6575e 100644 --- a/tools/cellxgene_census_builder/src/cellxgene_census_builder/util.py +++ b/tools/cellxgene_census_builder/src/cellxgene_census_builder/util.py @@ -17,9 +17,7 @@ def urljoin(base: str, url: str) -> str: - """ - like urllib.parse.urljoin, but doesn't get confused by S3:// - """ + """Like urllib.parse.urljoin, but doesn't get confused by s3://.""" p_url = urllib.parse.urlparse(url) if p_url.netloc: return url @@ -38,15 +36,15 @@ def urljoin(base: str, url: str) -> str: def urlcat(base: str, *paths: str) -> str: - """ - Concat one or more paths, separated with '/'. Similar to urllib.parse.urljoin, + """Concat one or more paths, separated with '/'. + + Similar to urllib.parse.urljoin, but doesn't get confused by S3:// and other "non-standard" protocols (treats - them as if they are same as http: or file:) + them as if they are same as http: or file:). 
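    For example (an illustrative call, not taken from this patch):

        urlcat("s3://bucket/base", "soma", "obs")  # -> "s3://bucket/base/soma/obs"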
Similar to urllib.parse.urljoin except it takes an iterator, and assumes the container_uri is a 'directory'/container, ie, ends in '/'. """ - url = base for p in paths: url = url if url.endswith("/") else url + "/" @@ -55,13 +53,11 @@ def urlcat(base: str, *paths: str) -> str: def env_var_init() -> None: - """ - Set environment variables as needed by dependencies, etc. + """Set environment variables as needed by dependencies, etc. This controls thread allocation for worker (child) processes. It is executed too late to influence __init__ time thread pool allocations for the main process. """ - # Each of these control thread-pool allocation for commonly used packages that # may be pulled into our environment, and which have import-time pool allocation. # Most do import time thread pool allocation equal to host CPU count, which can @@ -95,9 +91,7 @@ def env_var_init() -> None: def process_init(args: CensusBuildArgs) -> None: - """ - Called on every process start to configure global package/module behavior. - """ + """Called on every process start to configure global package/module behavior.""" logging_init(args) if multiprocessing.get_start_method(True) != "spawn": @@ -107,9 +101,9 @@ def process_init(args: CensusBuildArgs) -> None: class ProcessResourceGetter: - """ - Access to process resource state, primary for diagnostic/debugging purposes. Currently - provides current and high water mark for: + """Access to process resource state, primary for diagnostic/debugging purposes. + + Currently provides current and high water mark for: * thread count * mmaps * major page faults @@ -160,9 +154,9 @@ def majflt(self) -> tuple[int, int]: class SystemResourceGetter: - """ - Access to system resource state, primary for diagnostic/debugging purposes. Currently - provides current and high water mark for: + """Access to system resource state, primary for diagnostic/debugging purposes. + + Currently provides current and high water mark for: * memory total * memory available @@ -186,7 +180,7 @@ def mem_used(self) -> int: def log_process_resource_status(preface: str = "Resource use:", level: int = logging.DEBUG) -> None: - """Print current and historical max of thread and (memory) map counts""" + """Print current and historical max of thread and (memory) map counts.""" if platform.system() == "Linux": me = psutil.Process() mem_full_info = me.memory_full_info() @@ -228,10 +222,9 @@ def resource_logger_target() -> None: def cpu_count() -> int: - """ + """This function exists to always return a default of `1` when os.cpu_count returns None. + os.cpu_count() returns None if "undetermined" number of CPUs. - This function exists to always return a default of `1` when - os.cpu_count returns None. 
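    An equivalent one-liner, assuming only the None case needs a fallback
    (illustrative):

        return os.cpu_count() or 1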
""" cpu_count = os.cpu_count() if os.cpu_count() is None: diff --git a/tools/cellxgene_census_builder/tests/anndata/conftest.py b/tools/cellxgene_census_builder/tests/anndata/conftest.py index 86a1cd86d..b0c642b14 100644 --- a/tools/cellxgene_census_builder/tests/anndata/conftest.py +++ b/tools/cellxgene_census_builder/tests/anndata/conftest.py @@ -1,7 +1,7 @@ import pathlib -from typing import List import pytest + from cellxgene_census_builder.build_soma.datasets import Dataset from cellxgene_census_builder.build_state import CensusBuildArgs @@ -9,7 +9,7 @@ @pytest.fixture -def datasets_with_mixed_feature_reference(census_build_args: CensusBuildArgs) -> List[Dataset]: +def datasets_with_mixed_feature_reference(census_build_args: CensusBuildArgs) -> list[Dataset]: census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) assets_path = census_build_args.h5ads_path.as_posix() @@ -36,7 +36,7 @@ def datasets_with_mixed_feature_reference(census_build_args: CensusBuildArgs) -> @pytest.fixture -def datasets_with_larger_raw_layer(census_build_args: CensusBuildArgs) -> List[Dataset]: +def datasets_with_larger_raw_layer(census_build_args: CensusBuildArgs) -> list[Dataset]: census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) assets_path = census_build_args.h5ads_path.as_posix() @@ -65,7 +65,7 @@ def datasets_with_larger_raw_layer(census_build_args: CensusBuildArgs) -> List[D @pytest.fixture -def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -> List[Dataset]: +def datasets_with_incorrect_schema_version(census_build_args: CensusBuildArgs) -> list[Dataset]: census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) assets_path = census_build_args.h5ads_path.as_posix() diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py index 05c9d7691..a63cebc44 100644 --- a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py +++ b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py @@ -1,21 +1,21 @@ import pathlib -from typing import Any, List +from typing import Any import anndata as ad import numpy as np import numpy.typing as npt import pytest +from scipy import sparse + from cellxgene_census_builder.build_soma.anndata import AnnDataProxy, make_anndata_cell_filter, open_anndata from cellxgene_census_builder.build_soma.datasets import Dataset from cellxgene_census_builder.build_state import CensusBuildArgs -from scipy import sparse from ..conftest import ORGANISMS, get_anndata -def test_open_anndata(datasets: List[Dataset]) -> None: - """ - `open_anndata` should open the h5ads for each of the dataset in the argument, +def test_open_anndata(datasets: list[Dataset]) -> None: + """`open_anndata` should open the h5ads for each of the dataset in the argument, and yield both the dataset and the corresponding AnnData object. This test does not involve additional filtering steps. The `datasets` used here have no raw layer. 
@@ -31,11 +31,9 @@ def test_open_anndata(datasets: List[Dataset]) -> None: def test_open_anndata_filters_out_datasets_with_mixed_feature_reference( - datasets_with_mixed_feature_reference: List[Dataset], + datasets_with_mixed_feature_reference: list[Dataset], ) -> None: - """ - Datasets with a "mixed" feature_reference will not be included by the filter pipeline - """ + """Datasets with a "mixed" feature_reference will not be included by the filter pipeline""" ad_filter = make_anndata_cell_filter({}) result = [ad_filter(open_anndata(".", d)) for d in datasets_with_mixed_feature_reference] assert all(len(ad) == 0 for ad in result) @@ -43,11 +41,9 @@ def test_open_anndata_filters_out_datasets_with_mixed_feature_reference( def test_open_anndata_filters_out_wrong_schema_version_datasets( caplog: pytest.LogCaptureFixture, - datasets_with_incorrect_schema_version: List[Dataset], + datasets_with_incorrect_schema_version: list[Dataset], ) -> None: - """ - Datasets with a schema version different from `CXG_SCHEMA_VERSION` will not be included by `open_anndata` - """ + """Datasets with a schema version different from `CXG_SCHEMA_VERSION` will not be included by `open_anndata`""" for dataset in datasets_with_incorrect_schema_version: with pytest.raises(ValueError, match="incorrect CxG schema version"): _ = open_anndata(".", dataset) @@ -148,7 +144,7 @@ def test_AnnDataProxy_X_types(census_build_args: CensusBuildArgs, X_conv: str, X assert isinstance(adata[0:2].X, X_type) def _toarray(a: npt.NDArray[np.float32] | sparse.spmatrix) -> npt.NDArray[np.float32]: - if isinstance(a, (sparse.csc_matrix, sparse.csr_matrix)): + if isinstance(a, (sparse.csc_matrix, sparse.csr_matrix)): # noqa: UP038 return a.toarray() # type: ignore[no-any-return] else: return a diff --git a/tools/cellxgene_census_builder/tests/conftest.py b/tools/cellxgene_census_builder/tests/conftest.py index b183577ad..79fb3ac69 100644 --- a/tools/cellxgene_census_builder/tests/conftest.py +++ b/tools/cellxgene_census_builder/tests/conftest.py @@ -1,5 +1,4 @@ import pathlib -from typing import List, Optional import anndata import attrs @@ -7,13 +6,14 @@ import pandas as pd import pytest from _pytest.monkeypatch import MonkeyPatch +from scipy import sparse + from cellxgene_census_builder.build_soma.datasets import Dataset from cellxgene_census_builder.build_soma.globals import ( CENSUS_X_LAYERS_PLATFORM_CONFIG, ) from cellxgene_census_builder.build_state import CensusBuildArgs, CensusBuildConfig from cellxgene_census_builder.util import process_init -from scipy import sparse @attrs.define(frozen=True) @@ -27,9 +27,7 @@ class Organism: NUM_DATASET = 2 -def get_anndata( - organism: Organism, gene_ids: Optional[List[str]] = None, no_zero_counts: bool = False -) -> anndata.AnnData: +def get_anndata(organism: Organism, gene_ids: list[str] | None = None, no_zero_counts: bool = False) -> anndata.AnnData: gene_ids = gene_ids or GENE_IDS[0] n_cells = 4 n_genes = len(gene_ids) @@ -124,7 +122,7 @@ def census_build_args(request: pytest.FixtureRequest, tmp_path: pathlib.Path) -> @pytest.fixture -def datasets(census_build_args: CensusBuildArgs) -> List[Dataset]: +def datasets(census_build_args: CensusBuildArgs) -> list[Dataset]: census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) assets_path = census_build_args.h5ads_path.as_posix() datasets = [] diff --git a/tools/cellxgene_census_builder/tests/test_builder.py b/tools/cellxgene_census_builder/tests/test_builder.py index e98d84de2..1ea41336e 100644 --- 
a/tools/cellxgene_census_builder/tests/test_builder.py +++ b/tools/cellxgene_census_builder/tests/test_builder.py @@ -1,7 +1,6 @@ import os import pathlib from types import ModuleType -from typing import List from unittest.mock import patch import numpy as np @@ -10,6 +9,7 @@ import pytest import tiledb import tiledbsoma as soma + from cellxgene_census_builder.build_soma import build, validate from cellxgene_census_builder.build_soma.build_soma import build_step1_get_source_datasets from cellxgene_census_builder.build_soma.datasets import Dataset @@ -26,19 +26,19 @@ @pytest.mark.parametrize( - "census_build_args", [dict(multi_process=False, consolidate=True, build_tag="test_tag", verbose=0)], indirect=True + "census_build_args", + [{"multi_process": False, "consolidate": True, "build_tag": "test_tag", "verbose": 0}], + indirect=True, ) def test_base_builder_creation( - datasets: List[Dataset], + datasets: list[Dataset], census_build_args: CensusBuildArgs, setup: None, ) -> None: - """ - Runs the builder, queries the census and performs a set of base assertions. - """ + """Runs the builder, queries the census and performs a set of base assertions.""" with patch("cellxgene_census_builder.build_soma.build_soma.prepare_file_system"), patch( "cellxgene_census_builder.build_soma.build_soma.build_step1_get_source_datasets", return_value=datasets - ), patch("cellxgene_census_builder.build_soma.consolidate.submit_consolidate", return_value=list()), patch( + ), patch("cellxgene_census_builder.build_soma.consolidate.submit_consolidate", return_value=[]), patch( "cellxgene_census_builder.build_soma.validate_soma.validate_consolidation", return_value=True ): return_value = build(census_build_args) @@ -123,8 +123,7 @@ def test_base_builder_creation( def test_unicode_support(tmp_path: pathlib.Path) -> None: - """ - Regression test that unicode is supported correctly in tiledbsoma. + """Regression test that unicode is supported correctly in tiledbsoma. This test is not strictly necessary, but it validates the requirements that Census support unicode in DataFrame columns. """ @@ -143,7 +142,7 @@ def test_unicode_support(tmp_path: pathlib.Path) -> None: @pytest.mark.parametrize( "census_build_args", - [dict(manifest=True, verbose=2, build_tag="build_tag", multi_process=True, max_worker_processes=2)], + [{"manifest": True, "verbose": 2, "build_tag": "build_tag", "multi_process": True, "max_worker_processes": 2}], indirect=True, ) def test_build_step1_get_source_datasets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py index 5f37c41b7..5548487fe 100644 --- a/tools/cellxgene_census_builder/tests/test_manifest.py +++ b/tools/cellxgene_census_builder/tests/test_manifest.py @@ -5,14 +5,13 @@ import fsspec import pytest + from cellxgene_census_builder.build_soma.manifest import load_manifest from cellxgene_census_builder.build_state import CensusBuildConfig def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: str, empty_blocklist: str) -> None: - """ - If specified a parameter, `load_manifest` should load the dataset manifest from such file. 
- """ + """If specified a parameter, `load_manifest` should load the dataset manifest from such file.""" manifest = load_manifest(manifest_csv, empty_blocklist) assert len(manifest) == 2 assert manifest[0].dataset_id == "dataset_id_1" @@ -30,9 +29,7 @@ def test_load_manifest_from_file(tmp_path: pathlib.Path, manifest_csv: str, empt def test_load_manifest_does_dedup(manifest_csv_with_duplicates: str, empty_blocklist: str) -> None: - """ - `load_manifest` should not include duplicate datasets from the manifest - """ + """`load_manifest` should not include duplicate datasets from the manifest""" manifest = load_manifest(manifest_csv_with_duplicates, empty_blocklist) assert len(manifest) == 2 @@ -57,9 +54,7 @@ def test_manifest_dataset_block(tmp_path: pathlib.Path, manifest_csv: str, empty def test_load_manifest_from_cxg(empty_blocklist: str) -> None: - """ - If no parameters are specified, `load_manifest` should load the dataset list from Discover API. - """ + """If no parameters are specified, `load_manifest` should load the dataset list from Discover API.""" with patch("cellxgene_census_builder.build_soma.manifest.fetch_json") as m: m.return_value = [ { @@ -114,9 +109,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None: def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema( caplog: pytest.LogCaptureFixture, empty_blocklist: str ) -> None: - """ - `load_manifest` should exclude datasets that do not have a current schema version. - """ + """`load_manifest` should exclude datasets that do not have a current schema version.""" with patch("cellxgene_census_builder.build_soma.manifest.fetch_json") as m: m.return_value = [ { @@ -159,9 +152,7 @@ def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema( def test_load_manifest_from_cxg_excludes_datasets_with_no_assets( caplog: pytest.LogCaptureFixture, empty_blocklist: str ) -> None: - """ - `load_manifest` should raise error if it finds datasets without assets - """ + """`load_manifest` should raise error if it finds datasets without assets""" with patch("cellxgene_census_builder.build_soma.manifest.fetch_json") as m: m.return_value = [ { @@ -202,13 +193,11 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets( def test_blocklist_alive_and_well() -> None: - """ - Perform three checks: + """Perform three checks: 1. Block list is specified in the default configuration 2. The file exists at the specified location 3. 
The file "looks like" a block list """ - config = CensusBuildConfig() assert config.dataset_id_blocklist_uri diff --git a/tools/cellxgene_census_builder/tests/test_release_cleanup.py b/tools/cellxgene_census_builder/tests/test_release_cleanup.py index bbce13821..f238905c3 100644 --- a/tools/cellxgene_census_builder/tests/test_release_cleanup.py +++ b/tools/cellxgene_census_builder/tests/test_release_cleanup.py @@ -1,8 +1,9 @@ from datetime import datetime, timedelta -from typing import Any, Dict, Type +from typing import Any from unittest import mock import pytest + from cellxgene_census_builder.release_cleanup import remove_releases_older_than from cellxgene_census_builder.release_manifest import CensusReleaseManifest, CensusVersionName @@ -96,27 +97,26 @@ def tag_days_old(days_old: int) -> str: @pytest.mark.parametrize( "release_manifest,remove_kwargs,expected_delete_tags", [ - (RELEASE_MANIFEST, dict(days=0, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), - (RELEASE_MANIFEST, dict(days=9, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), - (RELEASE_MANIFEST, dict(days=99, census_base_url=S3_URI), (TAG_100D_OLD,)), - (RELEASE_MANIFEST, dict(days=999, census_base_url=S3_URI), ()), - (RELEASE_MANIFEST, dict(days=0, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), + (RELEASE_MANIFEST, dict(days=0, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), # noqa: C408 + (RELEASE_MANIFEST, dict(days=9, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), # noqa: C408 + (RELEASE_MANIFEST, dict(days=99, census_base_url=S3_URI), (TAG_100D_OLD,)), # noqa: C408 + (RELEASE_MANIFEST, dict(days=999, census_base_url=S3_URI), ()), # noqa: C408 + (RELEASE_MANIFEST, dict(days=0, census_base_url=S3_URI), (TAG_10D_OLD, TAG_100D_OLD)), # noqa: C408 ( {**RELEASE_MANIFEST, "latest": TAG_10D_OLD}, - dict(days=0, census_base_url=S3_URI), + {"days": 0, "census_base_url": S3_URI}, (TAG_NOW, TAG_100D_OLD), ), - ({**RELEASE_MANIFEST, "latest": TAG_10D_OLD}, dict(days=9, census_base_url=S3_URI), (TAG_100D_OLD,)), + ({**RELEASE_MANIFEST, "latest": TAG_10D_OLD}, dict(days=9, census_base_url=S3_URI), (TAG_100D_OLD,)), # noqa: C408 ], ) def test_remove_releases_older_than( release_manifest: CensusReleaseManifest, - remove_kwargs: Dict[str, Any], + remove_kwargs: dict[str, Any], dryrun: bool, expected_delete_tags: list[CensusVersionName], ) -> None: """Test the expected happy paths.""" - expected_delete_calls = [mock.call(f"{S3_URI}{tag}/", recursive=True) for tag in expected_delete_tags] expected_new_manifest = release_manifest.copy() for tag in expected_delete_tags: @@ -159,7 +159,7 @@ def test_remove_releases_older_than( "release_manifest,remove_kwargs,expected_error", [ # base path check - (RELEASE_MANIFEST, dict(days=0, census_base_url="s3://not/the/right/path/", dryrun=False), ValueError), + (RELEASE_MANIFEST, {"days": 0, "census_base_url": "s3://not/the/right/path/", "dryrun": False}, ValueError), # check that soma/h5ads are in the same 'directory' ( { @@ -170,13 +170,13 @@ def test_remove_releases_older_than( "h5ads": {"uri": f"{S3_URI}{TAG_NOW}/h5ads/oops/"}, }, }, - dict(days=0, census_base_url=S3_URI, dryrun=False), + {"days": 0, "census_base_url": S3_URI, "dryrun": False}, ValueError, ), ], ) def test_remove_releases_older_than_sanity_checks( - release_manifest: CensusReleaseManifest, remove_kwargs: Dict[str, Any], expected_error: Type[Exception] + release_manifest: CensusReleaseManifest, remove_kwargs: dict[str, Any], expected_error: type[Exception] ) -> None: """Test the expected 
sanity/error checks""" with ( diff --git a/tools/cellxgene_census_builder/tests/test_release_manifest.py b/tools/cellxgene_census_builder/tests/test_release_manifest.py index c70c302dc..964e4a1db 100644 --- a/tools/cellxgene_census_builder/tests/test_release_manifest.py +++ b/tools/cellxgene_census_builder/tests/test_release_manifest.py @@ -1,8 +1,9 @@ import datetime -from typing import Type, cast +from typing import cast from unittest import mock import pytest + from cellxgene_census_builder.build_state import CensusBuildConfig from cellxgene_census_builder.release_manifest import ( CENSUS_AWS_REGION, @@ -131,7 +132,7 @@ def test_validate_release_manifest(release_manifest: CensusReleaseManifest) -> N ], ) def test_validate_release_manifest_errors( - release_manifest: CensusReleaseManifest, expected_error: Type[BaseException] + release_manifest: CensusReleaseManifest, expected_error: type[BaseException] ) -> None: with pytest.raises(expected_error): validate_release_manifest(TEST_CENSUS_BASE_URL, release_manifest, live_corpus_check=False) diff --git a/tools/cellxgene_census_builder/tests/test_schema_util.py b/tools/cellxgene_census_builder/tests/test_schema_util.py index a0372c07a..27d284e57 100644 --- a/tools/cellxgene_census_builder/tests/test_schema_util.py +++ b/tools/cellxgene_census_builder/tests/test_schema_util.py @@ -2,6 +2,7 @@ import pandas as pd import pyarrow as pa import pytest + from cellxgene_census_builder.build_soma.schema_util import FieldSpec, TableSpec @@ -21,7 +22,7 @@ def test_create_spec() -> None: assert not ts.use_arrow_dictionaries # default is False assert len(ts.fields) == len(fields) assert all(isinstance(f, FieldSpec) for f in ts.fields) - assert all((a.type == (b[1] if isinstance(b, tuple) else b.type)) for a, b in zip(ts.fields, fields)) + assert all((a.type == (b[1] if isinstance(b, tuple) else b.type)) for a, b in zip(ts.fields, fields, strict=False)) assert list(ts.field_names()) == [f[0] if isinstance(f, tuple) else f.name for f in fields] assert ts.field("soma_joinid").name == "soma_joinid" assert ts.field("d").name == "d" diff --git a/tools/cellxgene_census_builder/tests/test_source_assets.py b/tools/cellxgene_census_builder/tests/test_source_assets.py index 4d2b19c10..b02803cf2 100644 --- a/tools/cellxgene_census_builder/tests/test_source_assets.py +++ b/tools/cellxgene_census_builder/tests/test_source_assets.py @@ -7,9 +7,7 @@ def test_source_assets(tmp_path: pathlib.Path, census_build_args: CensusBuildArgs) -> None: - """ - `source_assets` should copy the datasets from their `dataset_asset_h5ad_uri` to the specified `assets_dir` - """ + """`source_assets` should copy the datasets from their `dataset_asset_h5ad_uri` to the specified `assets_dir`""" datasets = [] (tmp_path / "source").mkdir() census_build_args.h5ads_path.mkdir(parents=True, exist_ok=True) diff --git a/tools/cellxgene_census_builder/tests/test_util.py b/tools/cellxgene_census_builder/tests/test_util.py index 9f83b1469..9baf8d840 100644 --- a/tools/cellxgene_census_builder/tests/test_util.py +++ b/tools/cellxgene_census_builder/tests/test_util.py @@ -1,8 +1,9 @@ import numpy as np import pytest +from scipy.sparse import coo_matrix, csr_matrix, triu + from cellxgene_census_builder.build_soma.util import array_chunker, is_nonnegative_integral from cellxgene_census_builder.util import urlcat, urljoin -from scipy.sparse import coo_matrix, csr_matrix, triu def test_is_nonnegative_integral() -> None: diff --git a/tools/cellxgene_census_builder/tests/test_workflow_steps.py 
b/tools/cellxgene_census_builder/tests/test_workflow_steps.py index 9339e8669..d24659b11 100644 --- a/tools/cellxgene_census_builder/tests/test_workflow_steps.py +++ b/tools/cellxgene_census_builder/tests/test_workflow_steps.py @@ -1,8 +1,9 @@ import pathlib -from typing import Callable +from collections.abc import Callable from unittest import mock import pytest + from cellxgene_census_builder.__main__ import do_data_copy, do_log_copy, do_report_copy, do_the_release from cellxgene_census_builder.build_state import CensusBuildArgs, CensusBuildConfig from cellxgene_census_builder.release_manifest import CensusVersionDescription diff --git a/tools/census_contrib/src/census_contrib/__main__.py b/tools/census_contrib/src/census_contrib/__main__.py index 9ce9b61b5..bbd070730 100644 --- a/tools/census_contrib/src/census_contrib/__main__.py +++ b/tools/census_contrib/src/census_contrib/__main__.py @@ -5,7 +5,7 @@ import sys import traceback from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any import numpy as np import pyarrow as pa @@ -163,9 +163,8 @@ def ingest(config: Config) -> None: A.write(block.rename_columns(["soma_dim_0", "soma_dim_1", "soma_data"])) -def validate_contrib_embedding(uri: Union[str, Path], config: Config, skip_storage_version_check: bool = False) -> None: - """ - Validate embedding where embedding metadata is encoded in the array. +def validate_contrib_embedding(uri: str | Path, config: Config, skip_storage_version_check: bool = False) -> None: + """Validate embedding where embedding metadata is encoded in the array. Raises upon invalid result """ @@ -187,10 +186,10 @@ def load_qc_anndata( config: Config, embedding: Path, obs_value_filter: str, - obs_columns: List[str], + obs_columns: list[str], emb_name: str, -) -> Optional[sc.AnnData]: - """Returns None if the value filter excludes all cells""" +) -> sc.AnnData | None: + """Returns None if the value filter excludes all cells.""" if "soma_joinid" not in obs_columns: obs_columns = ["soma_joinid"] + obs_columns @@ -239,7 +238,7 @@ def create_qc_plots(config: Config, embedding: Path) -> None: sc._settings.settings.autoshow = False sc._settings.settings.figdir = (config.args.cwd / "figures").as_posix() - def make_random_palette(n_colors: int) -> List[str]: + def make_random_palette(n_colors: int) -> list[str]: rng = np.random.default_rng() colors = rng.integers(0, 0xFFFFFF, size=n_colors, dtype=np.uint32) return [f"#{c:06X}" for c in colors] @@ -270,7 +269,7 @@ def make_random_palette(n_colors: int) -> List[str]: logger.info(f"Saving UMAP plots for {k}") for color_by in color_by_columns: n_categories = len(adata.obs[color_by].astype(str).astype("category").cat.categories) - plot_color_kwargs: Dict[str, Any] = dict(color=color_by) + plot_color_kwargs: dict[str, Any] = {"color": color_by} # scanpy does a good job until category counts > 102 if n_categories > len(sc.plotting.palettes.default_102): plot_color_kwargs["palette"] = make_random_palette(n_categories) @@ -278,8 +277,8 @@ def make_random_palette(n_colors: int) -> List[str]: def inject_embedding_into_census_build(config: Config, embedding_src_path: Path) -> None: - """ - Inject an existing embedding (ingested via this tool) into its corresponding Census build. + """Inject an existing embedding (ingested via this tool) into its corresponding Census build. 
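    An aside on `make_random_palette` above: `rng.integers` excludes its upper
    bound by default, so `#FFFFFF` itself is never produced. A usage sketch:

        rng = np.random.default_rng()
        colors = rng.integers(0, 0xFFFFFF, size=3, dtype=np.uint32)
        palette = [f"#{c:06X}" for c in colors]  # three strings like "#0F3A7C"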
+ Presumed workflow: * build census * create embedding(s) diff --git a/tools/census_contrib/src/census_contrib/args.py b/tools/census_contrib/src/census_contrib/args.py index 5906c0bfe..3ba44fb8d 100644 --- a/tools/census_contrib/src/census_contrib/args.py +++ b/tools/census_contrib/src/census_contrib/args.py @@ -1,7 +1,6 @@ from __future__ import annotations from pathlib import Path -from typing import Optional from tap import Tap @@ -13,9 +12,9 @@ class CommonArgs(Tap): # type: ignore[misc] verbose: int = 0 # Logging level metadata: str = "meta.yml" # Metadata file name, as .json or .yaml skip_storage_version_check: bool = False # Skip TileDB storage equivalence check - census_uri: Optional[ - str - ] = None # override Census URI. If not specified, will look up using metadata `census_version` field. + census_uri: str | None = ( + None # override Census URI. If not specified, will look up using metadata `census_version` field. + ) def configure(self) -> None: super().configure() @@ -84,11 +83,9 @@ def path_fix(self, arg_name: str) -> None: setattr(self, arg_name, self.cwd.joinpath(getattr(self, arg_name))) def process_args(self) -> None: - """ - process_args only called for classes where parse_ars is called, i.e. + """process_args is only called for classes where parse_args is called, i.e. not on sub-command classes. So do all sub-class process_args work here. """ - # Validate cwd if not self.cwd.is_dir(): raise ValueError("Must specify working directory") diff --git a/tools/census_contrib/src/census_contrib/census_util.py b/tools/census_contrib/src/census_contrib/census_util.py index 5ca28d5cf..40528339e 100644 --- a/tools/census_contrib/src/census_contrib/census_util.py +++ b/tools/census_contrib/src/census_contrib/census_util.py @@ -1,7 +1,7 @@ from __future__ import annotations import functools -from typing import Optional, Tuple, cast +from typing import cast import cellxgene_census import numpy as np @@ -14,9 +14,8 @@ logger = get_logger() -def open_census(census_version: Optional[str], census_uri: Optional[str]) -> soma.Collection: +def open_census(census_version: str | None, census_uri: str | None) -> soma.Collection: """Open and return the Census top-level handle.""" - if census_uri: return cellxgene_census.open_soma(uri=census_uri) @@ -24,9 +23,8 @@ def open_census(census_version: Optional[str], census_uri: Optional[str]) -> som @functools.cache -def get_obs_soma_joinids(config: Config) -> Tuple[npt.NDArray[np.int64], Tuple[int, ...]]: - """ - Return experiment obs soma_joind values and obs shape appropriate for the +def get_obs_soma_joinids(config: Config) -> tuple[npt.NDArray[np.int64], tuple[int, ...]]: + """Return experiment obs soma_joinid values and obs shape appropriate for the Census version specified in the metadata. 
""" with open_census(census_uri=config.args.census_uri, census_version=config.metadata.census_version) as census: @@ -37,7 +35,7 @@ def get_obs_soma_joinids(config: Config) -> Tuple[npt.NDArray[np.int64], Tuple[i return joinids, (joinids.max() + 1,) -def get_census_obs_uri_region(config: Config) -> Tuple[str, str]: +def get_census_obs_uri_region(config: Config) -> tuple[str, str]: with open_census(census_uri=config.args.census_uri, census_version=config.metadata.census_version) as census: exp = census["census_data"][config.metadata.experiment_name] uri = exp.obs.uri diff --git a/tools/census_contrib/src/census_contrib/config.py b/tools/census_contrib/src/census_contrib/config.py index 829b3e8af..03d16de69 100644 --- a/tools/census_contrib/src/census_contrib/config.py +++ b/tools/census_contrib/src/census_contrib/config.py @@ -11,5 +11,5 @@ @attrs.define(kw_only=True, frozen=True) class Config: - args: "Arguments" - metadata: "EmbeddingMetadata" + args: Arguments + metadata: EmbeddingMetadata diff --git a/tools/census_contrib/src/census_contrib/load.py b/tools/census_contrib/src/census_contrib/load.py index 28a030e94..55c655bf7 100644 --- a/tools/census_contrib/src/census_contrib/load.py +++ b/tools/census_contrib/src/census_contrib/load.py @@ -1,15 +1,16 @@ from __future__ import annotations from abc import ABCMeta, abstractproperty +from collections.abc import Iterator from contextlib import AbstractContextManager from pathlib import Path -from typing import Any, Dict, Iterator, Literal, Tuple, Union +from typing import Any, Literal import numpy as np import numpy.typing as npt import pyarrow as pa import tiledbsoma as soma -from typing_extensions import Self +from typing_extensions import Self # noqa from .census_util import get_obs_soma_joinids from .config import Config @@ -19,12 +20,11 @@ EmbeddingTableIterator = Iterator[pa.Table] -EmbeddingIJDDomains = Dict[Literal["i", "j", "d"], Union[Tuple[float, float], Tuple[None, None]]] +EmbeddingIJDDomains = dict[Literal["i", "j", "d"], tuple[float, float] | tuple[None, None]] class EmbeddingIJDPipe(EmbeddingTableIterator, AbstractContextManager["EmbeddingIJDPipe"], metaclass=ABCMeta): - """ - Returns pa.Table with i, j, and d columns (i.e., COO), in row-major/C sorted order. + """Returns pa.Table with i, j, and d columns (i.e., COO), in row-major/C sorted order. Must not have dups. """ @@ -37,7 +37,7 @@ def type(self) -> pa.DataType: @abstractproperty def domains(self) -> EmbeddingIJDDomains: - """Return domains of i, j, and d""" + """Return domains of i, j, and d.""" class SOMAIJDPipe(EmbeddingIJDPipe): @@ -80,7 +80,7 @@ def type(self) -> pa.DataType: @property def domains(self) -> EmbeddingIJDDomains: - """Return the domains of i, j and d""" + """Return the domains of i, j and d.""" logger.debug("SOMAIJDPipe - scanning for domains") _domains: EmbeddingIJDDomains = {"i": (None, None), "j": (None, None), "d": (None, None)} @@ -108,7 +108,8 @@ def accum_min_max(tbl: pa.Table, col_name: str, col_alias: Literal["i", "j", "d" class NPYIJDPipe(EmbeddingIJDPipe): - """ + """NPYIJDPipe. + Basic approach: 1. load joinid 1d array as npy or txt 2. 
argsort joinid array as there is no requirement it is 0..n @@ -167,7 +168,7 @@ def type(self) -> pa.DataType: @property def domains(self) -> EmbeddingIJDDomains: - """Return the domains of i, j and d""" + """Return the domains of i, j and d.""" logger.debug("NPYIJDPipe - scanning for domains") min_max = pa.compute.min_max(pa.array(self.embeddings.ravel())) diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index 82d1d283a..e5b04d779 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -2,7 +2,7 @@ import datetime import pathlib -from typing import Any, Dict, Optional, Tuple, Union, cast +from typing import Any, cast import attrs import cattrs @@ -11,7 +11,7 @@ import cellxgene_census import requests from attrs import field, validators -from typing_extensions import Self +from typing_extensions import Self # noqa from .args import Arguments from .census_util import open_census @@ -20,7 +20,7 @@ logger = get_logger() -def none_or_str(v: Optional[str]) -> str: +def none_or_str(v: str | None) -> str: return "" if v is None else v @@ -37,7 +37,7 @@ class EmbeddingMetadata: title: str = field(validator=validators.instance_of(str)) description: str = field(validator=validators.instance_of(str)) primary_contact: Contact = field(validator=validators.instance_of(Contact)) - additional_contacts: Tuple[Contact, ...] = field( + additional_contacts: tuple[Contact, ...] = field( factory=tuple, validator=validators.deep_iterable( validators.instance_of(Contact), @@ -56,7 +56,7 @@ class EmbeddingMetadata: submission_date: datetime.date = field(validator=validators.instance_of(datetime.date)) @classmethod - def from_dict(cls, md: Dict[str, Any]) -> Self: + def from_dict(cls, md: dict[str, Any]) -> Self: return cast(Self, cattrs.structure_attrs_fromdict(md, cls)) @classmethod @@ -75,8 +75,8 @@ def from_json(cls, data: str) -> Self: cattrs.preconf.json.make_converter(forbid_extra_keys=True, prefer_attrib_converters=True).loads(data, cls), ) - def to_dict(self) -> Dict[str, Any]: - return cast(Dict[str, Any], cattrs.unstructure(self)) + def to_dict(self) -> dict[str, Any]: + return cast(dict[str, Any], cattrs.unstructure(self)) def to_json(self) -> str: return cast(str, cattrs.preconf.json.make_converter().dumps(self)) @@ -85,7 +85,7 @@ def to_yaml(self) -> str: return cast(str, cattrs.preconf.pyyaml.make_converter().dumps(self)) -def load_metadata(path: Union[str, pathlib.Path]) -> EmbeddingMetadata: +def load_metadata(path: str | pathlib.Path) -> EmbeddingMetadata: metadata_path = pathlib.PosixPath(path) if not metadata_path.is_file(): raise ValueError("--metadata: file does not exist") @@ -109,7 +109,8 @@ def load_metadata(path: Union[str, pathlib.Path]) -> EmbeddingMetadata: def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> EmbeddingMetadata: - """ + """Validate the metadata. + Checks to perform on metadata: 1. Census version must be an LTS version (implies existence) 2. Census version, experiment and measurement must exist @@ -118,7 +119,6 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding 5. Title must have length < 128 characters 6. 
Description must have length < 2048 characters """ - if not metadata.id: raise ValueError("metadata is missing 'id' (accession)") @@ -144,8 +144,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding def validate_census_info(args: Arguments, metadata: EmbeddingMetadata) -> None: - """Errors / exists upon failure""" - + """Errors / exits upon failure.""" if not args.census_uri: # ie. if no override of census releases = cellxgene_census.get_census_version_directory() @@ -171,8 +170,7 @@ def validate_census_info(args: Arguments, metadata: EmbeddingMetadata) -> None: def validate_doi(metadata: EmbeddingMetadata) -> None: - """Errors / exists upon failure""" - + """Errors / exits upon failure.""" # 3. DOI must validate if specified if not metadata.DOI: return @@ -188,8 +186,7 @@ def validate_doi(metadata: EmbeddingMetadata) -> None: def validate_urls(metadata: EmbeddingMetadata) -> None: - """Errors / exits upon failure""" - + """Errors / exits upon failure.""" # 4. All supplied URLs must resolve for fld_name, url in [(f, getattr(metadata, f, "")) for f in ("model_link",)]: if url: diff --git a/tools/census_contrib/src/census_contrib/save.py b/tools/census_contrib/src/census_contrib/save.py index cecaa111f..ccf7730d6 100644 --- a/tools/census_contrib/src/census_contrib/save.py +++ b/tools/census_contrib/src/census_contrib/save.py @@ -2,7 +2,6 @@ import copy from pathlib import Path -from typing import Optional, Tuple, Union import numpy as np import numpy.typing as npt @@ -51,7 +50,7 @@ } -def make_platform_config(shape: Tuple[int, int], value_range: Tuple[float, float]) -> PlatformConfig: +def make_platform_config(shape: tuple[int, int], value_range: tuple[float, float]) -> PlatformConfig: platform_config = copy.deepcopy(PLATFORM_CONFIG_TEMPLATE) tdb_schema = platform_config["tiledb"]["create"] tdb_schema["dims"]["soma_dim_1"]["tile"] = shape[1] @@ -59,10 +58,10 @@ def create_obsm_like_array( - uri: Union[str, Path], - value_range: Tuple[float, float], # closed, i.e., inclusive [min, max] - shape: Tuple[int, int], - context: Optional[soma.options.SOMATileDBContext] = None, + uri: str | Path, + value_range: tuple[float, float], # closed, i.e., inclusive [min, max] + shape: tuple[int, int], + context: soma.options.SOMATileDBContext | None = None, ) -> soma.SparseNDArray: """Create and return opened array. Can be used as a context manager.""" array_path: str = Path(uri).as_posix() @@ -84,8 +83,7 @@ def reduce_float_precision(tbl: pa.Table, sig_bits: int = 7) -> pa.Table: def roundHalfToEven(a: npt.NDArray[np.float32], keepbits: int) -> npt.NDArray[np.float32]: - """ - Generate reduced precision floating point array, with round half to even. + """Generate reduced precision floating point array, with round half to even. IMPORTANT: In-place operation. 
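The round-half-to-even reduction above is the standard float32 mantissa bit-masking technique. A minimal sketch of the idea follows, assuming IEEE-754 float32 (23 explicit mantissa bits), finite values, a C-contiguous array, and 0 < keepbits < 23; the full body of roundHalfToEven is not reproduced in this hunk and may differ in detail.

```python
import numpy as np

def round_half_to_even_sketch(a: np.ndarray, keepbits: int) -> np.ndarray:
    """Hedged sketch: zero the low mantissa bits of float32, rounding ties to even."""
    assert a.dtype == np.float32 and 0 < keepbits < 23
    bits = a.view(np.uint32)          # reinterpret the bits; mutates `a` in place
    drop = 23 - keepbits              # number of mantissa bits to discard
    mask = (np.uint32(0xFFFFFFFF) >> drop) << drop
    half_minus_one = (1 << (drop - 1)) - 1
    # Adding the keep-LSB plus (half - 1) carries only when rounding up is
    # needed; exact halves carry only when the keep-LSB is odd (ties to even).
    bits += ((bits >> drop) & 1) + half_minus_one
    bits &= mask                      # truncate the discarded mantissa bits
    return a
```

With the default sig_bits=7 used by reduce_float_precision above, 16 mantissa bits are zeroed, which typically makes the embedding far more compressible while keeping roughly two significant decimal digits.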
@@ -107,7 +105,7 @@ def roundHalfToEven(a: npt.NDArray[np.float32], keepbits: int) -> npt.NDArray[np return a -def _consolidate_tiledb_object(uri: Union[str, Path], modes: Tuple[str, ...]) -> None: +def _consolidate_tiledb_object(uri: str | Path, modes: tuple[str, ...]) -> None: import tiledb path: str = Path(uri).as_posix() @@ -135,11 +133,11 @@ def _consolidate_tiledb_object(uri: Union[str, Path], modes: Tuple[str, ...]) -> logger.info(f"Consolidate/vacuum: end uri={path}") -def consolidate_array(uri: Union[str, Path]) -> None: +def consolidate_array(uri: str | Path) -> None: _consolidate_tiledb_object(uri, ("fragment_meta", "array_meta", "fragments", "commits")) -def consolidate_group(uri: Union[str, Path]) -> None: +def consolidate_group(uri: str | Path) -> None: # TODO: There is a bug in TileDB-Py that prevents consolidation of # group metadata. Skipping this step for now - remove this work-around # when the bug is fixed. As of 0.23.0, it is not yet fixed. diff --git a/tools/census_contrib/src/census_contrib/util.py b/tools/census_contrib/src/census_contrib/util.py index 8582ac31d..d561de20c 100644 --- a/tools/census_contrib/src/census_contrib/util.py +++ b/tools/census_contrib/src/census_contrib/util.py @@ -5,9 +5,10 @@ import math import pathlib import urllib +from collections.abc import Generator, Iterator, Sequence from concurrent.futures import Future, ThreadPoolExecutor from importlib.metadata import metadata -from typing import Any, Dict, Generator, Iterator, Optional, Sequence, Tuple, TypeVar, Union, cast +from typing import Any, TypeVar, cast import numpy as np import numpy.typing as npt @@ -20,8 +21,7 @@ @functools.cache def has_blockwise_iterator() -> bool: - """ - Feature flag. Return true if the tiledbsoma SparseNDArray contains the blockwise iterator. + """Feature flag. Return true if the tiledbsoma SparseNDArray contains the blockwise iterator. Introduced in version 1.5. """ return cast(bool, Version(metadata("tiledbsoma")["Version"]) >= Version("1.5.0")) @@ -36,14 +36,12 @@ def get_logger() -> logging.Logger: def blocksize(n_features: int, nnz_goal: int = MAX_NNZ_GOAL) -> int: - """ - Given an nnz goal, and n_features, return step size for a blockwise iterator. 
- """ + """Given an nnz goal, and n_features, return step size for a blockwise iterator.""" nnz_goal = max(nnz_goal, MAX_NNZ_GOAL) return cast(int, 2 ** round(math.log2((nnz_goal) / n_features))) -def soma_context(tiledb_config: Optional[Dict[str, Any]] = None) -> soma.options.SOMATileDBContext: +def soma_context(tiledb_config: dict[str, Any] | None = None) -> soma.options.SOMATileDBContext: """Return soma context with default config.""" tiledb_config = tiledb_config or {} return soma.options.SOMATileDBContext().replace( @@ -67,13 +65,13 @@ class EagerIterator(Iterator[_T]): def __init__( self, iterator: Iterator[_T], - pool: Optional[ThreadPoolExecutor] = None, + pool: ThreadPoolExecutor | None = None, ): super().__init__() self.iterator = iterator self._pool = pool or ThreadPoolExecutor() self._own_pool = pool is None - self._future: Optional[Future[_T]] = None + self._future: Future[_T] | None = None self._fetch_next() def _fetch_next(self) -> None: @@ -113,9 +111,9 @@ def blockwise_axis0_tables( A: soma.SparseNDArray, coords: soma.options.SparseNDCoords = (), result_order: soma.options.ResultOrderStr = soma.ResultOrder.AUTO, - size: Optional[Union[int, Sequence[int]]] = None, - reindex_disable_on_axis: Optional[Union[int, Sequence[int]]] = None, -) -> Generator[Tuple[pa.Table, Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]], None, None]: + size: int | Sequence[int] | None = None, + reindex_disable_on_axis: int | Sequence[int] | None = None, +) -> Generator[tuple[pa.Table, tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]], None, None]: assert A.ndim == 2 coords, size, reindex_disable_on_axis = _validate_args(A.shape, coords, size, reindex_disable_on_axis) minor_joinids = pa.array(np.concatenate(list(_coords_strider(coords[1], A.shape[1], A.shape[1])))) @@ -144,9 +142,9 @@ def blockwise_axis0_scipy_csr( A: soma.SparseNDArray, coords: soma.options.SparseNDCoords = (), result_order: soma.options.ResultOrderStr = soma.ResultOrder.AUTO, - size: Optional[Union[int, Sequence[int]]] = None, - reindex_disable_on_axis: Optional[Union[int, Sequence[int]]] = None, -) -> Generator[Tuple[sp.csr_matrix, Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]], None, None]: + size: int | Sequence[int] | None = None, + reindex_disable_on_axis: int | Sequence[int] | None = None, +) -> Generator[tuple[sp.csr_matrix, tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]], None, None]: assert A.ndim == 2 coords, size, reindex_disable_on_axis = _validate_args(A.shape, coords, size, reindex_disable_on_axis) @@ -170,17 +168,17 @@ def blockwise_axis0_scipy_csr( _ElemT = TypeVar("_ElemT") -def _pad_with_none(s: Sequence[_ElemT], to_length: int) -> Tuple[Optional[_ElemT], ...]: - """Given a sequence, pad length to a user-specified length, with None values""" +def _pad_with_none(s: Sequence[_ElemT], to_length: int) -> tuple[_ElemT | None, ...]: + """Given a sequence, pad length to a user-specified length, with None values.""" return tuple(s[i] if i < len(s) else None for i in range(to_length)) def _validate_args( - shape: Tuple[int, ...], + shape: tuple[int, ...], coords: soma.options.SparseNDCoords, - size: Optional[Union[int, Sequence[int]]] = None, - reindex_disable_on_axis: Optional[Union[int, Sequence[int]]] = None, -) -> Tuple[Tuple[Any, Any], Sequence[int], Sequence[int]]: + size: int | Sequence[int] | None = None, + reindex_disable_on_axis: int | Sequence[int] | None = None, +) -> tuple[tuple[Any, Any], Sequence[int], Sequence[int]]: ndim = len(shape) axis = [0] @@ -209,15 +207,13 @@ def _validate_args( 
def _coords_strider(coords: soma.options.SparseNDCoord, length: int, stride: int) -> Iterator[npt.NDArray[np.int64]]: - """ - Private. + """Private. Iterate over major coordinates, in stride sized steps, materializing each step as an ndarray of coordinate values. Will be sorted in ascending order. NB: SOMA slices are _closed_ (i.e., inclusive of both range start and stop) """ - # normalize coord to either a slice or ndarray # NB: type check on slice is to handle the case where coords is an NDArray, @@ -228,9 +224,9 @@ def _coords_strider(coords: soma.options.SparseNDCoord, length: int, stride: int coords = np.array([coords], dtype=np.int64) elif isinstance(coords, Sequence): coords = np.array(coords).astype(np.int64) - elif isinstance(coords, (pa.Array, pa.ChunkedArray)): + elif isinstance(coords, (pa.Array, pa.ChunkedArray)): # noqa: UP038 coords = coords.to_numpy() - elif not isinstance(coords, (np.ndarray, slice)): + elif not isinstance(coords, (np.ndarray, slice)): # noqa: UP038 raise TypeError("Unsupported slice coordinate type") if isinstance(coords, slice): diff --git a/tools/census_contrib/src/census_contrib/validate.py b/tools/census_contrib/src/census_contrib/validate.py index bd3fc465e..b831056a9 100644 --- a/tools/census_contrib/src/census_contrib/validate.py +++ b/tools/census_contrib/src/census_contrib/validate.py @@ -1,11 +1,10 @@ -""" -Validate an embedding -""" +"""Validate an embedding.""" from __future__ import annotations import concurrent.futures -from typing import Any, Generator, Tuple, TypeVar, Union, cast +from collections.abc import Generator +from typing import Any, TypeVar, cast import numba as nb import numpy as np @@ -37,7 +36,7 @@ def validate_compatible_tiledb_storage_format(uri: str, config: Config) -> None: - """Verify Census build and Embedding TileDB formats are identical""" + """Verify Census build and Embedding TileDB formats are identical.""" import tiledb # Fetch embedding storage version @@ -52,9 +51,7 @@ def validate_compatible_tiledb_storage_format(uri: str, config: Config) -> None: def validate_embedding(config: Config, uri: str) -> None: - """ - Validate an embedding saved as a SOMASparseNDArray, e.g., obsm-like. Raises on invalid - """ + """Validate an embedding saved as a SOMASparseNDArray, e.g., obsm-like. Raises on invalid.""" logger.info(f"Validating {uri}") metadata = config.metadata obs_joinids, _ = get_obs_soma_joinids(config) @@ -114,7 +111,7 @@ def validate_embedding(config: Config, uri: str) -> None: ) -def _validate_shape(shape: Tuple[int, ...], config: Config) -> None: +def _validate_shape(shape: tuple[int, ...], config: Config) -> None: _, obs_shape = get_obs_soma_joinids(config) if len(shape) != 2: @@ -131,8 +128,7 @@ def _validate_shape(shape: Tuple[int, ...], config: Config) -> None: @nb.njit() # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 def _isin_all(elmts: _NPT, test_elmts: _NPT) -> bool: - """ - Return equivalent of numpy.isin(elmts, test_elmts).all() without the + """Return equivalent of numpy.isin(elmts, test_elmts).all() without the memory allocation and extra reduction required by the numpy expression. 
""" test = set(test_elmts) @@ -142,7 +138,7 @@ def _isin_all(elmts: _NPT, test_elmts: _NPT) -> bool: return True -def isin_all(elmts: Union[pa.ChunkedArray, pa.Array, _NPT], test_elmts: _NPT) -> bool: +def isin_all(elmts: pa.ChunkedArray | pa.Array | _NPT, test_elmts: _NPT) -> bool: if isinstance(elmts, pa.ChunkedArray): return all(_isin_all(chunk.to_numpy(), test_elmts) for chunk in elmts.iterchunks()) elif isinstance(elmts, pa.Array): @@ -153,8 +149,7 @@ def isin_all(elmts: Union[pa.ChunkedArray, pa.Array, _NPT], test_elmts: _NPT) -> @nb.njit() # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 def _is_in_range_all(elmts: _NPT, min: float, max: float) -> bool: - """ - Return equivalent of np.logical_or((elmts < min), (elmts > max)).any() + """Return equivalent of np.logical_or((elmts < min), (elmts > max)).any() without the memory allocation and extra reduction required by the numpy expression. """ for i in range(len(elmts)): @@ -163,7 +158,7 @@ def _is_in_range_all(elmts: _NPT, min: float, max: float) -> bool: return True -def is_in_range_all(elmts: Union[pa.ChunkedArray, pa.Array, _NPT], min: float, max: float) -> bool: +def is_in_range_all(elmts: pa.ChunkedArray | pa.Array | _NPT, min: float, max: float) -> bool: if isinstance(elmts, pa.ChunkedArray): return all(_is_in_range_all(chunk.to_numpy(), min, max) for chunk in elmts.iterchunks()) elif isinstance(elmts, pa.Array): @@ -175,7 +170,7 @@ def is_in_range_all(elmts: Union[pa.ChunkedArray, pa.Array, _NPT], min: float, m @nb.njit() # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 def _is_sorted_unique( i: npt.NDArray[np.int64], j: npt.NDArray[np.int64], j_shape: int, last_coord: int -) -> Tuple[bool, int]: +) -> tuple[bool, int]: for n in range(len(i)): c_coord = i[n] * j_shape + j[n] if c_coord <= last_coord: @@ -185,7 +180,7 @@ def _is_sorted_unique( def is_sorted_unique(i: npt.NDArray[np.int64], j: npt.NDArray[np.int64], j_shape: int) -> bool: - ok, _ = cast(Tuple[bool, int], _is_sorted_unique(i, j, j_shape, -1)) + ok, _ = cast(tuple[bool, int], _is_sorted_unique(i, j, j_shape, -1)) return ok diff --git a/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb b/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb index 5501ae8a1..7a32a46d5 100644 --- a/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb +++ b/tools/census_contrib_qc/embeddings_qc_2023-12-15.ipynb @@ -29,17 +29,17 @@ "metadata": {}, "outputs": [], "source": [ - "from cellxgene_census.experimental import get_embedding_metadata\n", - "from cellxgene_census.experimental import get_embedding\n", + "import json\n", "import pprint\n", - "import cellxgene_census\n", - "import cellxgene_census\n", "import warnings\n", - "import json\n", + "\n", + "import cellxgene_census\n", "import scanpy\n", + "from cellxgene_census.experimental import get_embedding, get_embedding_metadata\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "import collections\n", + "\n", "import numpy as np" ] }, diff --git a/tools/models/geneformer/finetune-geneformer.py b/tools/models/geneformer/finetune-geneformer.py index db98693a7..13427706e 100644 --- a/tools/models/geneformer/finetune-geneformer.py +++ b/tools/models/geneformer/finetune-geneformer.py @@ -151,9 +151,7 @@ def make_trainer(config, model, train_dataset, test_dataset): def count_label_errors(eval): - """ - Count the label errors & error rate in the model's predictions. 
- """ + """Count the label errors & error rate in the model's predictions.""" pred_labels = eval.predictions.argmax(-1) true_labels = eval.label_ids assert len(pred_labels) == len(true_labels) diff --git a/tools/models/geneformer/generate-geneformer-embeddings.py b/tools/models/geneformer/generate-geneformer-embeddings.py index 2039d77c2..e3adbc589 100755 --- a/tools/models/geneformer/generate-geneformer-embeddings.py +++ b/tools/models/geneformer/generate-geneformer-embeddings.py @@ -31,7 +31,7 @@ def main(argv): aws_region = "us-west-2" try: aws_region = boto3.Session().region_name - except Exception: + except Exception: # noqa: BLE001 pass tiledbsoma_context = tiledbsoma.options.SOMATileDBContext( tiledb_ctx=tiledb.Ctx( diff --git a/tools/models/geneformer/helpers/ontology_mapper.py b/tools/models/geneformer/helpers/ontology_mapper.py index 88366ba28..d5d01eb86 100644 --- a/tools/models/geneformer/helpers/ontology_mapper.py +++ b/tools/models/geneformer/helpers/ontology_mapper.py @@ -1,6 +1,5 @@ # mypy: ignore-errors -""" -Provides classes to recreate cell type and tissue mappings as used in CELLxGENE Discover +"""Provides classes to recreate cell type and tissue mappings as used in CELLxGENE Discover. - OntologyMapper abstract class to create other mappers - SystemMapper to map any tissue to a System @@ -13,7 +12,6 @@ import os from abc import ABC, abstractmethod -from typing import List, Optional, Union import owlready2 @@ -31,8 +29,8 @@ class OntologyMapper(ABC): def __init__( self, - high_level_ontology_term_ids: List[str], - ontology_owl_path: Union[str, os.PathLike], + high_level_ontology_term_ids: list[str], + ontology_owl_path: str | os.PathLike, root_ontology_term_id: str, ): self._cached_high_level_terms = {} @@ -50,11 +48,8 @@ def __init__( except TypeError: self._ontology = owlready2.get_ontology(ontology_owl_path).load() - def get_high_level_terms(self, ontology_term_id: str) -> List[Optional[str]]: - """ - Returns the associated high-level ontology term IDs from any other ID - """ - + def get_high_level_terms(self, ontology_term_id: str) -> list[str | None]: + """Returns the associated high-level ontology term IDs from any other I.""" ontology_term_id = self.reformat_ontology_term_id(ontology_term_id, to_writable=False) if ontology_term_id in self._cached_high_level_terms: @@ -93,27 +88,24 @@ def get_high_level_terms(self, ontology_term_id: str) -> List[Optional[str]]: return resulting_high_level_terms - def get_top_high_level_term(self, ontology_term_id: str) -> Optional[str]: - """ - Return the top high level term - """ - + def get_top_high_level_term(self, ontology_term_id: str) -> str | None: + """Return the top high level term.""" return self.get_high_level_terms(ontology_term_id)[0] @abstractmethod def _get_branch_ancestors(self, owl_entity): - """ - Gets ALL ancestors from an owl entity. What's defined as an ancestor depends on the mapper type, for - example CL ancestors are likely to just include is_a relationship + """Gets ALL ancestors from an owl entity. + + What's defined as an ancestor depends on the mapper type, for + example CL ancestors are likely to just include is_a relationship. """ def get_label_from_id(self, ontology_term_id: str): - """ - Returns the label from and ontology term id that is in writable form + """Returns the label from and ontology term id that is in writable form. 
+ + Example: "UBERON:0002048" returns "lung" Example: "UBERON_0002048" raises ValueError because the ID is not in writable form """ - if ontology_term_id in self._cached_labels: return self._cached_labels[ontology_term_id] @@ -131,12 +123,12 @@ def get_label_from_id(self, ontology_term_id: str): @staticmethod def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True): - """ - Converts ontology term id string between two formats: - - `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048" - - `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048" - """ + """Reformats ontology term ID string. + Converts ontology term id string between two formats. + - `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048" + - `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048" + """ if ontology_term_id is None: return None @@ -149,9 +141,8 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True): raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'") return ontology_term_id.replace(":", "_") - def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: Optional[List[str]] = None) -> List[str]: - """ - Recursive function that given an entity of an ontology, it traverses the ontology and returns + def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]: + """Recursive function that, given an entity of an ontology, traverses the ontology and returns a list of all ancestors associated with the entity. """ ancestors = ancestors or [] @@ -182,9 +173,7 @@ def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: Option return ancestors def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass: - """ - Given a readable ontology term id (e.g. 
"UBERON_0002048"), it returns the associated ontology entity.""" return self._ontology.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}") @staticmethod @@ -206,8 +195,8 @@ class CellMapper(OntologyMapper): # Only look up ancestors under Cell ROOT_NODE = "CL_0000000" - def __init__(self, cell_type_high_level_ontology_term_ids: List[str]): - super(CellMapper, self).__init__( + def __init__(self, cell_type_high_level_ontology_term_ids: list[str]): + super(CellMapper, self).__init__( # noqa: UP008 high_level_ontology_term_ids=cell_type_high_level_ontology_term_ids, ontology_owl_path=self.CXG_CL_ONTOLOGY_URL, root_ontology_term_id=self.ROOT_NODE, @@ -242,9 +231,9 @@ class TissueMapper(OntologyMapper): # Only look up ancestors under anatomical entity ROOT_NODE = "UBERON_0001062" - def __init__(self, tissue_high_level_ontology_term_ids: List[str]): + def __init__(self, tissue_high_level_ontology_term_ids: list[str]): self.cell_type_high_level_ontology_term_ids = tissue_high_level_ontology_term_ids - super(TissueMapper, self).__init__( + super(TissueMapper, self).__init__( # noqa: UP008 high_level_ontology_term_ids=tissue_high_level_ontology_term_ids, ontology_owl_path=self.CXG_UBERON_ONTOLOGY_URL, root_ontology_term_id=self.ROOT_NODE, diff --git a/tools/models/geneformer/prepare-census-geneformer-dataset.py b/tools/models/geneformer/prepare-census-geneformer-dataset.py index 070e43b2c..414777fb9 100755 --- a/tools/models/geneformer/prepare-census-geneformer-dataset.py +++ b/tools/models/geneformer/prepare-census-geneformer-dataset.py @@ -42,13 +42,13 @@ def main(argv): with GeneformerTokenizer( census_human, obs_query=tiledbsoma.AxisQuery(coords=(coords,)), - obs_attributes=list( + obs_attributes=[ # cell_subclass isn't yet in Census (select_cells() added it to obs_df for us), so # exclude from the experiment axis query it for it in args.obs_columns if it not in ("cell_subclass", "cell_subclass_ontology_term_id") - ), + ], ) as tokenizer: logger.info(f"tokenizing {len(coords)} cells...") dataset = tokenizer.build() @@ -131,8 +131,7 @@ def parse_arguments(argv): def select_cells(census_human, value_filter, percentage_data, sampling_column, N): - """ - Select the desired cells from the human census experiment. + """Select the desired cells from the human census experiment. Return a pd.DataFrame indexed by soma_joinid with additional cell_subclass and cell_subclass_ontology_term_id attributes. These aren't currently provided in obs, so we derive them on the fly. 
@@ -152,8 +151,7 @@ def select_cells(census_human, value_filter, percentage_data, sampling_column, N mapper = CellSubclassMapper(map_orphans_to_class=True) obs_df["cell_subclass_ontology_term_id"] = obs_df["cell_type_ontology_term_id"].map( # if CellSubclassMapper doesn't find a subclass, just use the cell type itself - lambda it: mapper.get_top_high_level_term(it) - or it + lambda it: mapper.get_top_high_level_term(it) or it ) obs_df["cell_subclass"] = obs_df["cell_subclass_ontology_term_id"].map(lambda it: mapper.get_label_from_id(it)) subclass_counts = Counter(obs_df["cell_subclass"]) diff --git a/tools/models/scvi/scvi-train.py b/tools/models/scvi/scvi-train.py index 19f031035..abfb040e9 100644 --- a/tools/models/scvi/scvi-train.py +++ b/tools/models/scvi/scvi-train.py @@ -41,7 +41,7 @@ devices = train_config.get("devices") multi_gpu = train_config.get("multi_gpu", False) - trainer_config = train_config.get("trainer") or dict() + trainer_config = train_config.get("trainer") or {} training_plan_config = config.get("training_plan") diff --git a/tools/pyproject.toml b/tools/pyproject.toml index bfa567e0e..811b3ae83 100644 --- a/tools/pyproject.toml +++ b/tools/pyproject.toml @@ -1,6 +1,80 @@ -[tool.black] +[tool.ruff] line-length = 120 -target_version = ['py310'] +src = ["tools"] +target-version = "py311" + +[tool.ruff.lint] +select = [ + "F", # Errors detected by Pyflakes + "E", # Error detected by Pycodestyle + "W", # Warning detected by Pycodestyle + "I", # isort + "D", # pydocstyle + "B", # flake8-bugbear + "TID", # flake8-tidy-imports + "C4", # flake8-comprehensions + "BLE", # flake8-blind-except + "UP", # pyupgrade + "RUF100", # Report unused noqa directives +] +ignore = [ + # line too long -> we accept long comment lines; formatter gets rid of long code lines + "E501", + # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient + "E731", + # allow I, O, l as variable names -> I is the identity matrix + "E741", + # Missing docstring in public package + "D104", + # Missing docstring in public module + "D100", + # Missing docstring in __init__ + "D107", + # Errors from function calls in argument defaults. These are fine when the result is immutable. + "B008", + # __magic__ methods are often self-explanatory, allow missing docstrings + "D105", + # first line should end with a period [Bug: doesn't work with single-line docstrings] + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + ## Disable one in each pair of mutually incompatible rules + # We don’t want a blank line before a class docstring + "D203", + # We want docstrings to start immediately after the opening triple quote + "D213", + # Missing argument description in the docstring TODO: enable + "D417", + # Blank line required between summary line and description TODO: enable + "D205", + # Prefer absolute imports over relative imports from parent modules TODO: enable + "TID252", + # Missing docstring in public class TODO: enable + "D101", + # Missing docstring in public method TODO: enable + "D102", + # Missing docstring in public function TODO: enable + "D103", +] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"*/__init__.py" = ["F401"] +"*/tests/*" = ["D"] +# Module level import not at top of cell +"*/*.ipynb" = ["E402"] + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" +# Like Black, indent with spaces, rather than tabs. 
+indent-style = "space" +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" [tool.mypy] show_error_codes = true @@ -12,9 +86,3 @@ plugins = "numpy.typing.mypy_plugin" [[tool.mypy.overrides]] module = "tools.models.scvi.*" ignore_errors = true - -[tool.ruff] -select = ["E", "F", "B", "I"] -ignore = ["E501", "E402", "C408", ] -line-length = 120 -target-version = "py39"
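For reference, a sketch of the docstring shape the `[tool.ruff.lint]` configuration above steers toward once the TODO-marked D rules are re-enabled: Google convention (`convention = "google"`), a one-line summary, then described arguments and return value. The function name is borrowed from the builder's util module seen earlier in this patch, but the body here is illustrative only, not the repo's implementation.

```python
def urlcat(base: str, *parts: str) -> str:
    """Join path segments onto a base URL.

    Args:
        base: The base URL.
        parts: Path segments to append, in order.

    Returns:
        The joined URL.
    """
    # Illustrative body only; the real urlcat may treat slashes differently.
    return "/".join([base.rstrip("/"), *(p.strip("/") for p in parts)])
```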