From 77d28a5368c19d90050f41b45e0e07d5224d3283 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 8 Mar 2024 15:42:02 -0800 Subject: [PATCH] lint part 1 --- .../src/cellxgene_census/_get_anndata.py | 10 ++-- .../src/cellxgene_census/_util.py | 6 +- .../cellxgene_census/experimental/__init__.py | 8 ++- .../experimental/_embedding.py | 57 ++++++++++++------- .../tests/experimental/test_embeddings.py | 38 +++++++++---- .../tests/test_get_anndata.py | 1 + .../cellxgene_census/tests/test_util.py | 13 +++-- .../src/census_contrib/metadata.py | 2 +- 8 files changed, 87 insertions(+), 48 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index e3056d0e8..e251c8f0b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,9 +12,8 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord -from ._util import _extract_census_version - from ._experiment import _get_experiment +from ._util import _extract_census_version CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" @@ -103,13 +102,14 @@ def get_anndata( column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, - varm_layers=varm_layers + varm_layers=varm_layers, ) # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot - if add_obs_embeddings or add_var_embeddings: + if add_obs_embeddings is not None: obs_soma_joinids = query.obs_joinids() from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name + census_version = _extract_census_version(census) for emb in add_obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, organism, census_version, "obs_embedding") @@ -118,5 +118,3 @@ def get_anndata( adata.obsm[emb] = embedding return adata - - diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 93b8c6caa..cd26a3ad8 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -1,4 +1,5 @@ import urllib.parse + import tiledbsoma as soma @@ -20,6 +21,7 @@ def _uri_join(base: str, url: str) -> str: ] return urllib.parse.urlunparse(parts) -def _extract_census_version(census: soma.Collection): + +def _extract_census_version(census: soma.Collection) -> str: """Extract the Census version from the given Census object.""" - return urllib.parse.urlparse(census.uri).path.split("/")[2] \ No newline at end of file + return urllib.parse.urlparse(census.uri).path.split("/")[2] diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index e09759bc2..4a65cb3ab 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,6 +1,12 @@ """Experimental API for the CELLxGENE Discover Census.""" -from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding +from ._embedding import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding, + get_embedding_metadata, + get_embedding_metadata_by_name, +) __all__ = [ "get_embedding", diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 6c29b32f7..5753e9c23 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -13,8 +13,8 @@ import numpy.typing as npt import pandas as pd import pyarrow as pa -import tiledbsoma as soma import requests +import tiledbsoma as soma from .._open import get_default_soma_context, open_soma from .._release_directory import get_census_version_directory @@ -140,8 +140,11 @@ def get_embedding( return embedding -def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding") -> dict[str, Any]: - """Return metadata for a specific embedding. If more embeddings match the query parameters, + +def get_embedding_metadata_by_name( + embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding" +) -> dict[str, Any]: + """Return metadata for a specific embedding. If more embeddings match the query parameters, the most recent one will be returned. Args: @@ -167,14 +170,20 @@ def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_ve manifest = response.json() embeddings = [] for _, obj in manifest.items(): - if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type and obj["census_version"] == census_version: + if ( + obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + and obj["census_version"] == census_version + ): embeddings.append(obj) if len(embeddings) == 0: raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}") - + return sorted(embeddings, key=lambda x: x["submission_date"])[-1] + def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: """Return a dictionary of all available embeddings for a given Census version. @@ -188,19 +197,19 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: Examples: >>> get_all_available_embeddings('2023-12-15') [{ - 'experiment_name': 'experiment_1', - 'measurement_name': 'RNA', - 'organism': "homo_sapiens", - 'census_version': '2023-12-15', - 'n_embeddings': 1000, - 'n_features': 200, + 'experiment_name': 'experiment_1', + 'measurement_name': 'RNA', + 'organism': "homo_sapiens", + 'census_version': '2023-12-15', + 'n_embeddings': 1000, + 'n_features': 200, 'uri': 's3://bucket/embedding_1' }] """ response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - + embeddings = [] manifest = response.json() for _, obj in manifest.items(): @@ -209,16 +218,18 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: return embeddings -def get_all_census_versions_with_embedding(embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding") -> list[str]: - """ - Get a list of all census versions that contain a specific embedding. + +def get_all_census_versions_with_embedding( + embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding" +) -> list[str]: + """Get a list of all census versions that contain a specific embedding. Args: - embedding_name: + embedding_name: The name of the embedding, e.g. "scvi". - organism: + organism: The organism for which the embedding is associated. - embedding_type: + embedding_type: The type of embedding. Defaults to "obs_embedding". Returns: @@ -226,11 +237,15 @@ def get_all_census_versions_with_embedding(embedding_name: str, organism: str, e """ response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - + versions = set() manifest = response.json() for _, obj in manifest.items(): - if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type: + if ( + obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + ): versions.add(obj["census_version"]) - return sorted(list(versions)) \ No newline at end of file + return sorted(versions) diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py index 331f5a202..aeb0ff661 100644 --- a/api/python/cellxgene_census/tests/experimental/test_embeddings.py +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py @@ -1,8 +1,11 @@ import pytest import requests_mock as rm -from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding, get_embedding_metadata_by_name - +from cellxgene_census.experimental import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding_metadata_by_name, +) from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL @@ -16,7 +19,7 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: "experiment_name": "homo_sapiens", "data_type": "obs_embedding", "census_version": "2023-12-15", - "submission_date": "2023-11-15" + "submission_date": "2023-11-15", }, "embedding-id-2": { "id": "embedding-id-2", @@ -42,23 +45,34 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: requests_mock.real_http = True requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) - embedding = get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + embedding = get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) assert embedding is not None - assert embedding["id"] == "embedding-id-2" # most recent version + assert embedding["id"] == "embedding-id-2" # most recent version assert embedding == mock_embeddings["embedding-id-2"] - embedding = get_embedding_metadata_by_name("emb_3", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + embedding = get_embedding_metadata_by_name( + "emb_3", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) assert embedding is not None - assert embedding["id"] == "embedding-id-3" + assert embedding["id"] == "embedding-id-3" assert embedding == mock_embeddings["embedding-id-3"] with pytest.raises(ValueError): - get_embedding_metadata_by_name("emb_2", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-10-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "var_embedding") + get_embedding_metadata_by_name( + "emb_2", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-10-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="var_embedding" + ) - def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: mock_embeddings = { diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index d0716d770..be36c2db8 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -194,6 +194,7 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[ assert obsm_layer in ad.obsm.keys() assert ad.obsm[obsm_layer].shape[0] == 100 + @pytest.mark.live_corpus @pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer"]]) def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddings: List[str]) -> None: diff --git a/api/python/cellxgene_census/tests/test_util.py b/api/python/cellxgene_census/tests/test_util.py index a9e6162df..3e5a82d68 100644 --- a/api/python/cellxgene_census/tests/test_util.py +++ b/api/python/cellxgene_census/tests/test_util.py @@ -1,8 +1,10 @@ -from cellxgene_census._util import _uri_join, _extract_census_version -import cellxgene_census -import pytest import re +import pytest + +import cellxgene_census +from cellxgene_census._util import _extract_census_version, _uri_join + def test_uri_join() -> None: assert _uri_join("https://foo/", "bar") == "https://foo/bar" @@ -23,11 +25,12 @@ def test_uri_join() -> None: assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b" + @pytest.mark.live_corpus def test_extract_census_version() -> None: """Ensures that extracting the Census version from a Collection object does not break""" - pattern = r'^\d{4}-\d{2}-\d{2}$' + pattern = r"^\d{4}-\d{2}-\d{2}$" with cellxgene_census.open_soma(census_version="stable") as census: assert census is not None @@ -37,4 +40,4 @@ def test_extract_census_version() -> None: with cellxgene_census.open_soma(census_version="latest") as census: assert census is not None version = _extract_census_version(census) - assert re.match(pattern, version) \ No newline at end of file + assert re.match(pattern, version) diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index faae70363..486e49c23 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -144,7 +144,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding # 7. Name must have length < 128 characters MAX_NAME_LENGTH = 128 - if not metadata.name or len(metadata.name) > MAX_NAME_LENGTH: + if not metadata.embedding_name or len(metadata.embedding_name) > MAX_NAME_LENGTH: raise ValueError( f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length", )