From d126590ead4c72cdcec456a8646931e3c1e11d41 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Wed, 28 Feb 2024 10:49:07 -0800 Subject: [PATCH 01/14] Draft --- .../src/cellxgene_census/_get_anndata.py | 7 +++ .../experimental/_embedding.py | 45 +++++++++++++++++++ tools/census_contrib/embedding_metadata.md | 2 + .../src/census_contrib/metadata.py | 9 ++++ 4 files changed, 63 insertions(+) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 54018796f..03c155063 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -22,6 +22,7 @@ def get_anndata( X_name: str = "raw", X_layers: Optional[Sequence[str]] = (), obsm_layers: Optional[Sequence[str]] = (), + varm_layers: Optional[Sequence[str]] = (), obs_value_filter: Optional[str] = None, obs_coords: Optional[SparseDFCoord] = None, var_value_filter: Optional[str] = None, @@ -58,6 +59,12 @@ def get_anndata( Columns to fetch for ``obs`` and ``var`` dataframes. obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. + Use :func:`get_all_available_embeddings` to retrieve available embeddings + for this Census version and organism. + varm_layers: + Additional varm layers to read and return in the ``varm`` slot. + Use :func:`get_all_available_embeddings` to retrieve available embeddings + for this Census version and organism. Returns: An :class:`anndata.AnnData` object containing the census slice. diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index fc6b21495..4ed53bfe4 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -136,3 +136,48 @@ def get_embedding( np.put(embedding.reshape(-1), indices, emb) return embedding + + +def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: + """Return a dictionary of all available embeddings for a given Census version. + + Args: + census_version: + The Census version tag, e.g., ``"2023-12-15"``. + + Returns: + A list of dictionaries, each containing metadata describing an available embedding. + + Examples: + >>> get_all_available_embeddings('2023-12-15') + [{ + 'experiment_name': 'experiment_1', + 'measurement_name': 'RNA', + 'organism': "homo_sapiens", + 'census_version': '2023-12-15', + 'n_embeddings': 1000, + 'n_features': 200, + 'uri': 's3://bucket/embedding_1' + }] + + """ + pass + + +def get_all_census_versions_with_embedding( + embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding" +) -> list[str]: + """Get a list of all census versions that contain a specific embedding. + + Args: + embedding_name: + The name of the embedding. + organism: + The organism for which the embedding is associated. + embedding_type: + The type of embedding. Defaults to "obs_embedding". + + Returns: + A list of census versions that contain the specified embedding. + """ + pass diff --git a/tools/census_contrib/embedding_metadata.md b/tools/census_contrib/embedding_metadata.md index 0a8a7a141..b909e140a 100644 --- a/tools/census_contrib/embedding_metadata.md +++ b/tools/census_contrib/embedding_metadata.md @@ -21,6 +21,7 @@ Each embedding will contain a variety of metadata stored in the SOMA `metadata` | Field name | Required | Type | Description | | ---------------------- | -------- | ------------- | ------------------------------------------------------------------------------------- | | id | required | string | CZI-assigned accession ID for this embedding | +| embedding_name | required | string | Name of the algorithm used to generate the embedding | | title | required | string | Brief project title | | description | required | string | Succinct description of the method and characteristics of the embeddings and model | | primary_contact | required | Contact | Primary contact person for these embeddings. | @@ -49,6 +50,7 @@ For example: ```json { "id": "CxG-contrib-99999", + "embedding_name": "model_algo", "title": "An embedding", "description": "Longer description of the embedding and method used to generate it", "primary_contact": { diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index f4f283ead..280d5ca2f 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -34,6 +34,7 @@ class Contact: @attrs.define(kw_only=True, frozen=True) class EmbeddingMetadata: id: str = field(validator=validators.instance_of(str)) + embedding_name: str = field(validator=validators.instance_of(str)) title: str = field(validator=validators.instance_of(str)) description: str = field(validator=validators.instance_of(str)) primary_contact: Contact = field(validator=validators.instance_of(Contact)) @@ -118,6 +119,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding 4. All supplied URLs must resolve 5. Title must have length < 128 characters 6. Description must have length < 2048 characters + 7. Name must have length < 24 characters """ if not metadata.id: raise ValueError("metadata is missing 'id' (accession)") @@ -140,6 +142,13 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding "Metadata: description must be string between 1 and {MAX_DESCRIPTION_LENGTH} characters in length", ) + # 7. Name must have length < 24 characters + MAX_NAME_LENGTH = 24 + if not metadata.name or len(metadata.name) > MAX_NAME_LENGTH: + raise ValueError( + f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length", + ) + return metadata From c221799fccc8ccf4674530b34a67522111620a75 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Wed, 28 Feb 2024 13:27:31 -0800 Subject: [PATCH 02/14] Feedback --- .../src/cellxgene_census/_get_anndata.py | 10 ++++++++-- tools/census_contrib/src/census_contrib/metadata.py | 6 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 03c155063..6ce220807 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -28,6 +28,8 @@ def get_anndata( var_value_filter: Optional[str] = None, var_coords: Optional[SparseDFCoord] = None, column_names: Optional[soma.AxisColumnNames] = None, + add_obs_embeddings: Optional[Sequence[str]] = (), + add_var_embeddings: Optional[Sequence[str]] = (), ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -59,10 +61,14 @@ def get_anndata( Columns to fetch for ``obs`` and ``var`` dataframes. obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. - Use :func:`get_all_available_embeddings` to retrieve available embeddings - for this Census version and organism. varm_layers: Additional varm layers to read and return in the ``varm`` slot. + add_obs_embeddings: + Embeddings to be returned as part of the ``obsm`` slot. + Use :func:`get_all_available_embeddings` to retrieve available embeddings + for this Census version and organism. + add_var_embeddings: + Embeddings to be returned as part of the ``varm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings for this Census version and organism. diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index 280d5ca2f..faae70363 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -119,7 +119,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding 4. All supplied URLs must resolve 5. Title must have length < 128 characters 6. Description must have length < 2048 characters - 7. Name must have length < 24 characters + 7. Name must have length < 128 characters """ if not metadata.id: raise ValueError("metadata is missing 'id' (accession)") @@ -142,8 +142,8 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding "Metadata: description must be string between 1 and {MAX_DESCRIPTION_LENGTH} characters in length", ) - # 7. Name must have length < 24 characters - MAX_NAME_LENGTH = 24 + # 7. Name must have length < 128 characters + MAX_NAME_LENGTH = 128 if not metadata.name or len(metadata.name) > MAX_NAME_LENGTH: raise ValueError( f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length", From 43534b022e2b8e1ea62fc60c79253ae49920f25f Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Thu, 29 Feb 2024 15:58:20 -0800 Subject: [PATCH 03/14] Add basic unit test --- .../tests/experimental/test_embeddings.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 api/python/cellxgene_census/tests/experimental/test_embeddings.py diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py new file mode 100644 index 000000000..1ed1e3dde --- /dev/null +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py @@ -0,0 +1,109 @@ +import pytest +import requests_mock as rm + +from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding + + +@pytest.mark.live_corpus +def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "measurement_name": "RNA", + "n_embeddings": 1000, + "n_features": 200, + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_2", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "measurement_name": "RNA", + "n_embeddings": 1000, + "n_features": 200, + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + embeddings = get_all_available_embeddings("2023-12-15") + assert embeddings is not None + assert len(embeddings) == 2 + + # Query for a non existing version of the Census + embeddings = get_all_available_embeddings("2024-12-15") + assert len(embeddings) == 0 + + +@pytest.mark.live_corpus +def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_1", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-3": { + "id": "embedding-id-3", + "embedding_name": "emb_1", + "title": "Embedding 3", + "description": "Third embedding", + "experiment_name": "mus_musculus", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-4": { + "id": "embedding-id-4", + "embedding_name": "emb_1", + "title": "Embedding 4", + "description": "Fourth embedding", + "experiment_name": "mus_musculus", + "data_type": "obs_embedding", + "census_version": "2024-01-01", + }, + "embedding-id-5": { + "id": "embedding-id-5", + "embedding_name": "emb_2", + "title": "Embedding 5", + "description": "Fifth embedding", + "experiment_name": "mus_musculus", + "data_type": "var_embedding", + "census_version": "2023-12-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + versions = get_all_census_versions_with_embedding("emb_1", organism="homo_sapiens", embedding_type="obs_embedding") + assert versions == ["2023-12-15"] + + versions = get_all_census_versions_with_embedding("emb_1", organism="mus_musculus", embedding_type="obs_embedding") + assert versions == ["2023-12-15", "2024-01-01"] + + versions = get_all_census_versions_with_embedding("emb_1", organism="mus_musculus", embedding_type="var_embedding") + assert versions == [] + + versions = get_all_census_versions_with_embedding("emb_2", organism="mus_musculus", embedding_type="var_embedding") + assert versions == ["2023-12-15"] From b0472e39fa558bd68bf4fb54a3df727046542046 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Thu, 29 Feb 2024 16:04:05 -0800 Subject: [PATCH 04/14] Add basic unit test, pass 2 --- .../tests/test_get_anndata.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index 9c079cbb7..d0716d770 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -158,7 +158,6 @@ def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None: assert raise_info.value.args[0] == "Unknown X layer name" -@pytest.mark.skip(reason="Enable when obsm is available in a live Census distribution.") @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layer", ["scvi", "geneformer"]) def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> None: @@ -177,7 +176,6 @@ def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> assert ad.obsm[obsm_layer].shape[0] == 100 -@pytest.mark.skip(reason="Enable when obsm is available in a live Census distribution.") @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[str]) -> None: @@ -195,3 +193,21 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[ for obsm_layer in obsm_layers: assert obsm_layer in ad.obsm.keys() assert ad.obsm[obsm_layer].shape[0] == 100 + +@pytest.mark.live_corpus +@pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer"]]) +def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddings: List[str]) -> None: + with census: + ad = cellxgene_census.get_anndata( + census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + add_obs_embeddings=add_obs_embeddings, + ) + + assert len(ad.obsm.keys()) == 2 + for obsm_layer in add_obs_embeddings: + assert obsm_layer in ad.obsm.keys() + assert ad.obsm[obsm_layer].shape[0] == 100 From ddafdf0abb69ae2fc86174ff9a34253c0b028e3c Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 1 Mar 2024 10:32:31 -0800 Subject: [PATCH 05/14] Checkpoint --- .../src/cellxgene_census/_get_anndata.py | 13 ++- .../src/cellxgene_census/_util.py | 5 ++ .../cellxgene_census/experimental/__init__.py | 5 +- .../experimental/_embedding.py | 89 +++++++++++++++---- .../tests/experimental/test_embeddings.py | 60 ++++++++++++- .../cellxgene_census/tests/test_util.py | 21 ++++- 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 6ce220807..d8a2d7515 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,6 +12,8 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord +from util import _extract_census_version + from ._experiment import _get_experiment @@ -88,14 +90,23 @@ def get_anndata( exp = _get_experiment(census, organism) obs_coords = (slice(None),) if obs_coords is None else (obs_coords,) var_coords = (slice(None),) if var_coords is None else (var_coords,) + with exp.axis_query( measurement_name, obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords), ) as query: - return query.to_anndata( + adata = query.to_anndata( X_name=X_name, column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, + varm_layers=varm_layers ) + + # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot + if add_obs_embeddings or add_var_embeddings: + from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name + census_version = _extract_census_version(census) + get_embedding_metadata_by_name() + diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 8b7e5685b..93b8c6caa 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -1,4 +1,5 @@ import urllib.parse +import tiledbsoma as soma def _uri_join(base: str, url: str) -> str: @@ -18,3 +19,7 @@ def _uri_join(base: str, url: str) -> str: p_url.fragment, ] return urllib.parse.urlunparse(parts) + +def _extract_census_version(census: soma.Collection): + """Extract the Census version from the given Census object.""" + return urllib.parse.urlparse(census.uri).path.split("/")[2] \ No newline at end of file diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index c37c08789..e09759bc2 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,8 +1,11 @@ """Experimental API for the CELLxGENE Discover Census.""" -from ._embedding import get_embedding, get_embedding_metadata +from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding __all__ = [ "get_embedding", "get_embedding_metadata", + "get_embedding_metadata_by_name", + "get_all_available_embeddings", + "get_all_census_versions_with_embedding", ] diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 4ed53bfe4..6c29b32f7 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -14,10 +14,13 @@ import pandas as pd import pyarrow as pa import tiledbsoma as soma +import requests from .._open import get_default_soma_context, open_soma from .._release_directory import get_census_version_directory +CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json" + def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBContext | None = None) -> dict[str, Any]: """Read embedding metadata and return as a Python dict. @@ -137,6 +140,40 @@ def get_embedding( return embedding +def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding") -> dict[str, Any]: + """Return metadata for a specific embedding. If more embeddings match the query parameters, + the most recent one will be returned. + + Args: + embedding_name: + The name of the embedding, e.g. "scvi". + organism: + The organism for which the embedding is associated. + census_version: + The Census version tag, e.g., ``"2023-12-15"``. + embedding_type: + Either "obs_embedding" or "var_embedding". Defaults to "obs_embedding". + + Returns: + A dictionary containing metadata describing the embedding. + + Raises: + ValueError: if no embeddings are found for the specified query parameters. + + """ + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() + + manifest = response.json() + embeddings = [] + for _, obj in manifest.items(): + if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type and obj["census_version"] == census_version: + embeddings.append(obj) + + if len(embeddings) == 0: + raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}") + + return sorted(embeddings, key=lambda x: x["submission_date"])[-1] def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: """Return a dictionary of all available embeddings for a given Census version. @@ -151,33 +188,49 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: Examples: >>> get_all_available_embeddings('2023-12-15') [{ - 'experiment_name': 'experiment_1', - 'measurement_name': 'RNA', - 'organism': "homo_sapiens", - 'census_version': '2023-12-15', - 'n_embeddings': 1000, - 'n_features': 200, + 'experiment_name': 'experiment_1', + 'measurement_name': 'RNA', + 'organism': "homo_sapiens", + 'census_version': '2023-12-15', + 'n_embeddings': 1000, + 'n_features': 200, 'uri': 's3://bucket/embedding_1' }] """ - pass - - -def get_all_census_versions_with_embedding( - embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding" -) -> list[str]: - """Get a list of all census versions that contain a specific embedding. + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() + + embeddings = [] + manifest = response.json() + for _, obj in manifest.items(): + if obj["census_version"] == census_version: + embeddings.append(obj) + + return embeddings + +def get_all_census_versions_with_embedding(embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding") -> list[str]: + """ + Get a list of all census versions that contain a specific embedding. Args: - embedding_name: - The name of the embedding. - organism: + embedding_name: + The name of the embedding, e.g. "scvi". + organism: The organism for which the embedding is associated. - embedding_type: + embedding_type: The type of embedding. Defaults to "obs_embedding". Returns: A list of census versions that contain the specified embedding. """ - pass + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() + + versions = set() + manifest = response.json() + for _, obj in manifest.items(): + if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type: + versions.add(obj["census_version"]) + + return sorted(list(versions)) \ No newline at end of file diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py index 1ed1e3dde..331f5a202 100644 --- a/api/python/cellxgene_census/tests/experimental/test_embeddings.py +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py @@ -1,10 +1,65 @@ import pytest import requests_mock as rm -from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding +from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding, get_embedding_metadata_by_name +from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL + + +def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-11-15" + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_1", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-12-31", + }, + "embedding-id-3": { + "id": "embedding-id-3", + "embedding_name": "emb_3", + "title": "Embedding 3", + "description": "Third embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-11-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + embedding = get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + assert embedding is not None + assert embedding["id"] == "embedding-id-2" # most recent version + assert embedding == mock_embeddings["embedding-id-2"] + + embedding = get_embedding_metadata_by_name("emb_3", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + assert embedding is not None + assert embedding["id"] == "embedding-id-3" + assert embedding == mock_embeddings["embedding-id-3"] + + with pytest.raises(ValueError): + get_embedding_metadata_by_name("emb_2", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-10-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "var_embedding") + + -@pytest.mark.live_corpus def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: mock_embeddings = { "embedding-id-1": { @@ -44,7 +99,6 @@ def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: assert len(embeddings) == 0 -@pytest.mark.live_corpus def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> None: mock_embeddings = { "embedding-id-1": { diff --git a/api/python/cellxgene_census/tests/test_util.py b/api/python/cellxgene_census/tests/test_util.py index d7f08902b..a9e6162df 100644 --- a/api/python/cellxgene_census/tests/test_util.py +++ b/api/python/cellxgene_census/tests/test_util.py @@ -1,4 +1,7 @@ -from cellxgene_census._util import _uri_join +from cellxgene_census._util import _uri_join, _extract_census_version +import cellxgene_census +import pytest +import re def test_uri_join() -> None: @@ -19,3 +22,19 @@ def test_uri_join() -> None: assert _uri_join("file:///foo/bar", "a") == "file:///foo/a" assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b" + +@pytest.mark.live_corpus +def test_extract_census_version() -> None: + """Ensures that extracting the Census version from a Collection object does not break""" + + pattern = r'^\d{4}-\d{2}-\d{2}$' + + with cellxgene_census.open_soma(census_version="stable") as census: + assert census is not None + version = _extract_census_version(census) + assert re.match(pattern, version) + + with cellxgene_census.open_soma(census_version="latest") as census: + assert census is not None + version = _extract_census_version(census) + assert re.match(pattern, version) \ No newline at end of file From 337b571f9d499e06554798302854828cf8e18536 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 1 Mar 2024 13:58:33 -0800 Subject: [PATCH 06/14] Checkpoint --- .../src/cellxgene_census/_get_anndata.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index d8a2d7515..e3056d0e8 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,10 +12,12 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord -from util import _extract_census_version +from ._util import _extract_census_version from ._experiment import _get_experiment +CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" + def get_anndata( census: soma.Collection, @@ -106,7 +108,15 @@ def get_anndata( # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot if add_obs_embeddings or add_var_embeddings: + obs_soma_joinids = query.obs_joinids() from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name census_version = _extract_census_version(census) - get_embedding_metadata_by_name() + for emb in add_obs_embeddings: + emb_metadata = get_embedding_metadata_by_name(emb, organism, census_version, "obs_embedding") + uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" + embedding = get_embedding(census_version, uri, obs_soma_joinids) + adata.obsm[emb] = embedding + + return adata + From 77d28a5368c19d90050f41b45e0e07d5224d3283 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 8 Mar 2024 15:42:02 -0800 Subject: [PATCH 07/14] lint part 1 --- .../src/cellxgene_census/_get_anndata.py | 10 ++-- .../src/cellxgene_census/_util.py | 6 +- .../cellxgene_census/experimental/__init__.py | 8 ++- .../experimental/_embedding.py | 57 ++++++++++++------- .../tests/experimental/test_embeddings.py | 38 +++++++++---- .../tests/test_get_anndata.py | 1 + .../cellxgene_census/tests/test_util.py | 13 +++-- .../src/census_contrib/metadata.py | 2 +- 8 files changed, 87 insertions(+), 48 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index e3056d0e8..e251c8f0b 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,9 +12,8 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord -from ._util import _extract_census_version - from ._experiment import _get_experiment +from ._util import _extract_census_version CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" @@ -103,13 +102,14 @@ def get_anndata( column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, - varm_layers=varm_layers + varm_layers=varm_layers, ) # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot - if add_obs_embeddings or add_var_embeddings: + if add_obs_embeddings is not None: obs_soma_joinids = query.obs_joinids() from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name + census_version = _extract_census_version(census) for emb in add_obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, organism, census_version, "obs_embedding") @@ -118,5 +118,3 @@ def get_anndata( adata.obsm[emb] = embedding return adata - - diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 93b8c6caa..cd26a3ad8 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -1,4 +1,5 @@ import urllib.parse + import tiledbsoma as soma @@ -20,6 +21,7 @@ def _uri_join(base: str, url: str) -> str: ] return urllib.parse.urlunparse(parts) -def _extract_census_version(census: soma.Collection): + +def _extract_census_version(census: soma.Collection) -> str: """Extract the Census version from the given Census object.""" - return urllib.parse.urlparse(census.uri).path.split("/")[2] \ No newline at end of file + return urllib.parse.urlparse(census.uri).path.split("/")[2] diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index e09759bc2..4a65cb3ab 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,6 +1,12 @@ """Experimental API for the CELLxGENE Discover Census.""" -from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding +from ._embedding import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding, + get_embedding_metadata, + get_embedding_metadata_by_name, +) __all__ = [ "get_embedding", diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 6c29b32f7..5753e9c23 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -13,8 +13,8 @@ import numpy.typing as npt import pandas as pd import pyarrow as pa -import tiledbsoma as soma import requests +import tiledbsoma as soma from .._open import get_default_soma_context, open_soma from .._release_directory import get_census_version_directory @@ -140,8 +140,11 @@ def get_embedding( return embedding -def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding") -> dict[str, Any]: - """Return metadata for a specific embedding. If more embeddings match the query parameters, + +def get_embedding_metadata_by_name( + embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding" +) -> dict[str, Any]: + """Return metadata for a specific embedding. If more embeddings match the query parameters, the most recent one will be returned. Args: @@ -167,14 +170,20 @@ def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_ve manifest = response.json() embeddings = [] for _, obj in manifest.items(): - if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type and obj["census_version"] == census_version: + if ( + obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + and obj["census_version"] == census_version + ): embeddings.append(obj) if len(embeddings) == 0: raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}") - + return sorted(embeddings, key=lambda x: x["submission_date"])[-1] + def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: """Return a dictionary of all available embeddings for a given Census version. @@ -188,19 +197,19 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: Examples: >>> get_all_available_embeddings('2023-12-15') [{ - 'experiment_name': 'experiment_1', - 'measurement_name': 'RNA', - 'organism': "homo_sapiens", - 'census_version': '2023-12-15', - 'n_embeddings': 1000, - 'n_features': 200, + 'experiment_name': 'experiment_1', + 'measurement_name': 'RNA', + 'organism': "homo_sapiens", + 'census_version': '2023-12-15', + 'n_embeddings': 1000, + 'n_features': 200, 'uri': 's3://bucket/embedding_1' }] """ response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - + embeddings = [] manifest = response.json() for _, obj in manifest.items(): @@ -209,16 +218,18 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: return embeddings -def get_all_census_versions_with_embedding(embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding") -> list[str]: - """ - Get a list of all census versions that contain a specific embedding. + +def get_all_census_versions_with_embedding( + embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding" +) -> list[str]: + """Get a list of all census versions that contain a specific embedding. Args: - embedding_name: + embedding_name: The name of the embedding, e.g. "scvi". - organism: + organism: The organism for which the embedding is associated. - embedding_type: + embedding_type: The type of embedding. Defaults to "obs_embedding". Returns: @@ -226,11 +237,15 @@ def get_all_census_versions_with_embedding(embedding_name: str, organism: str, e """ response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - + versions = set() manifest = response.json() for _, obj in manifest.items(): - if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type: + if ( + obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + ): versions.add(obj["census_version"]) - return sorted(list(versions)) \ No newline at end of file + return sorted(versions) diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py index 331f5a202..aeb0ff661 100644 --- a/api/python/cellxgene_census/tests/experimental/test_embeddings.py +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py @@ -1,8 +1,11 @@ import pytest import requests_mock as rm -from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding, get_embedding_metadata_by_name - +from cellxgene_census.experimental import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding_metadata_by_name, +) from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL @@ -16,7 +19,7 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: "experiment_name": "homo_sapiens", "data_type": "obs_embedding", "census_version": "2023-12-15", - "submission_date": "2023-11-15" + "submission_date": "2023-11-15", }, "embedding-id-2": { "id": "embedding-id-2", @@ -42,23 +45,34 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: requests_mock.real_http = True requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) - embedding = get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + embedding = get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) assert embedding is not None - assert embedding["id"] == "embedding-id-2" # most recent version + assert embedding["id"] == "embedding-id-2" # most recent version assert embedding == mock_embeddings["embedding-id-2"] - embedding = get_embedding_metadata_by_name("emb_3", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + embedding = get_embedding_metadata_by_name( + "emb_3", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) assert embedding is not None - assert embedding["id"] == "embedding-id-3" + assert embedding["id"] == "embedding-id-3" assert embedding == mock_embeddings["embedding-id-3"] with pytest.raises(ValueError): - get_embedding_metadata_by_name("emb_2", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-10-15", embedding_type = "obs_embedding") - get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "var_embedding") + get_embedding_metadata_by_name( + "emb_2", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-10-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="var_embedding" + ) - def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: mock_embeddings = { diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index d0716d770..be36c2db8 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -194,6 +194,7 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[ assert obsm_layer in ad.obsm.keys() assert ad.obsm[obsm_layer].shape[0] == 100 + @pytest.mark.live_corpus @pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer"]]) def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddings: List[str]) -> None: diff --git a/api/python/cellxgene_census/tests/test_util.py b/api/python/cellxgene_census/tests/test_util.py index a9e6162df..3e5a82d68 100644 --- a/api/python/cellxgene_census/tests/test_util.py +++ b/api/python/cellxgene_census/tests/test_util.py @@ -1,8 +1,10 @@ -from cellxgene_census._util import _uri_join, _extract_census_version -import cellxgene_census -import pytest import re +import pytest + +import cellxgene_census +from cellxgene_census._util import _extract_census_version, _uri_join + def test_uri_join() -> None: assert _uri_join("https://foo/", "bar") == "https://foo/bar" @@ -23,11 +25,12 @@ def test_uri_join() -> None: assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b" + @pytest.mark.live_corpus def test_extract_census_version() -> None: """Ensures that extracting the Census version from a Collection object does not break""" - pattern = r'^\d{4}-\d{2}-\d{2}$' + pattern = r"^\d{4}-\d{2}-\d{2}$" with cellxgene_census.open_soma(census_version="stable") as census: assert census is not None @@ -37,4 +40,4 @@ def test_extract_census_version() -> None: with cellxgene_census.open_soma(census_version="latest") as census: assert census is not None version = _extract_census_version(census) - assert re.match(pattern, version) \ No newline at end of file + assert re.match(pattern, version) diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index faae70363..486e49c23 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -144,7 +144,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding # 7. Name must have length < 128 characters MAX_NAME_LENGTH = 128 - if not metadata.name or len(metadata.name) > MAX_NAME_LENGTH: + if not metadata.embedding_name or len(metadata.embedding_name) > MAX_NAME_LENGTH: raise ValueError( f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length", ) From c91b1e8888e99dfac5c1fc462dd52983979cbc7b Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 8 Mar 2024 16:22:29 -0800 Subject: [PATCH 08/14] Refactor variable --- api/python/cellxgene_census/src/cellxgene_census/_util.py | 3 ++- .../src/cellxgene_census/experimental/_embedding.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index cd26a3ad8..81992dbf7 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -24,4 +24,5 @@ def _uri_join(base: str, url: str) -> str: def _extract_census_version(census: soma.Collection) -> str: """Extract the Census version from the given Census object.""" - return urllib.parse.urlparse(census.uri).path.split("/")[2] + version: str = urllib.parse.urlparse(census.uri).path.split("/")[2] + return version diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 5753e9c23..392b38c02 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -167,7 +167,7 @@ def get_embedding_metadata_by_name( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - manifest = response.json() + manifest = cast(dict[str, dict[str, Any]], response.json()) embeddings = [] for _, obj in manifest.items(): if ( From 1f53a94cc8c72404f7eedab83f8f4e558df60c64 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Thu, 14 Mar 2024 15:04:32 -0700 Subject: [PATCH 09/14] More work --- .../src/cellxgene_census/_experiment.py | 8 ++- .../src/cellxgene_census/_get_anndata.py | 27 +++++--- .../tests/test_get_anndata.py | 64 +++++++++++++++---- 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py index 4092d374a..308ee8bc6 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py @@ -12,6 +12,12 @@ import tiledbsoma as soma +def _get_experiment_name(organism: str) -> str: + """Given an organism name, return the experiment name.""" + # lower/snake case the organism name to find the experiment name + return re.sub(r"[ ]+", "_", organism).lower() + + def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: """Given a census :class:`tiledbsoma.Collection`, return the experiment for the named organism. Organism matching is somewhat flexible, attempting to map from human-friendly @@ -40,7 +46,7 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: >>> human = get_experiment(census, "homo_sapiens") """ # lower/snake case the organism name to find the experiment name - exp_name = re.sub(r"[ ]+", "_", organism).lower() + exp_name = _get_experiment_name(organism) if exp_name not in census["census_data"]: raise ValueError(f"Unknown organism {organism} - does not exist") diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index e251c8f0b..765fb857d 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,7 +12,7 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord -from ._experiment import _get_experiment +from ._experiment import _get_experiment, _get_experiment_name from ._util import _extract_census_version CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" @@ -106,15 +106,26 @@ def get_anndata( ) # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot - if add_obs_embeddings is not None: - obs_soma_joinids = query.obs_joinids() + if add_obs_embeddings is not None or add_var_embeddings is not None: from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name census_version = _extract_census_version(census) - for emb in add_obs_embeddings: - emb_metadata = get_embedding_metadata_by_name(emb, organism, census_version, "obs_embedding") - uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" - embedding = get_embedding(census_version, uri, obs_soma_joinids) - adata.obsm[emb] = embedding + experiment_name = _get_experiment_name(organism) + + if add_obs_embeddings is not None: + obs_soma_joinids = query.obs_joinids() + for emb in add_obs_embeddings: + emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") + uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" + embedding = get_embedding(census_version, uri, obs_soma_joinids) + adata.obsm[emb] = embedding + + if add_var_embeddings is not None: + var_soma_joinids = query.var_joinids() + for emb in add_var_embeddings: + emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") + uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" + embedding = get_embedding(census_version, uri, var_soma_joinids) + adata.varm[emb] = embedding return adata diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index be36c2db8..7b8ddc54d 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -12,6 +12,11 @@ def census() -> soma.Collection: return cellxgene_census.open_soma(census_version="latest") +@pytest.fixture +def lts_census() -> soma.Collection: + return cellxgene_census.open_soma(census_version="stable") + + @pytest.mark.live_corpus def test_get_anndata_value_filter(census: soma.Collection) -> None: with census: @@ -160,10 +165,12 @@ def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None: @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layer", ["scvi", "geneformer"]) -def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> None: - with census: +def test_get_anndata_obsm_one_layer(lts_census: soma.Collection, obsm_layer: str) -> None: + # NOTE: this test will break after next LTS release (>2023-12-15), since scvi and geneformer + # won't be distributed as part of `obsm_layers` anymore. Delete this test when it happens. + with lts_census: ad = cellxgene_census.get_anndata( - census, + lts_census, organism="Homo sapiens", X_name="raw", obs_coords=slice(100), @@ -173,15 +180,17 @@ def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> assert len(ad.obsm.keys()) == 1 assert obsm_layer in ad.obsm.keys() - assert ad.obsm[obsm_layer].shape[0] == 100 + assert ad.obsm[obsm_layer].shape[0] == 101 @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) -def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[str]) -> None: - with census: +def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: List[str]) -> None: + # NOTE: this test will break after next LTS release (>2023-12-15), since scvi and geneformer + # won't be distributed as part of `obsm_layers` anymore. Delete this test when it happens. + with lts_census: ad = cellxgene_census.get_anndata( - census, + lts_census, organism="Homo sapiens", X_name="raw", obs_coords=slice(100), @@ -192,15 +201,18 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[ assert len(ad.obsm.keys()) == 2 for obsm_layer in obsm_layers: assert obsm_layer in ad.obsm.keys() - assert ad.obsm[obsm_layer].shape[0] == 100 + assert ad.obsm[obsm_layer].shape[0] == 101 @pytest.mark.live_corpus -@pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer"]]) -def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddings: List[str]) -> None: - with census: +@pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer", "uce"]]) +def test_get_anndata_add_obs_embeddings(lts_census: soma.Collection, add_obs_embeddings: List[str]) -> None: + # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, + # so this test could require adjustments. + + with lts_census: ad = cellxgene_census.get_anndata( - census, + lts_census, organism="Homo sapiens", X_name="raw", obs_coords=slice(100), @@ -208,7 +220,31 @@ def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddi add_obs_embeddings=add_obs_embeddings, ) - assert len(ad.obsm.keys()) == 2 + assert len(ad.obsm.keys()) == 3 + assert len(ad.varm.keys()) == 0 for obsm_layer in add_obs_embeddings: assert obsm_layer in ad.obsm.keys() - assert ad.obsm[obsm_layer].shape[0] == 100 + assert ad.obsm[obsm_layer].shape[0] == 101 + + +@pytest.mark.live_corpus +@pytest.mark.parametrize("add_var_embeddings", [["nmf"]]) +def test_get_anndata_add_var_embeddings(lts_census: soma.Collection, add_var_embeddings: List[str]) -> None: + # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, + # so this test could require adjustments. + + with lts_census: + ad = cellxgene_census.get_anndata( + lts_census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + add_var_embeddings=add_var_embeddings, + ) + + assert len(ad.obsm.keys()) == 0 + assert len(ad.varm.keys()) == 1 + for varm_layers in add_var_embeddings: + assert varm_layers in ad.varm.keys() + assert ad.varm[varm_layers].shape[0] == 201 From 49ced873bdc1f8123211772de78a7b629f3ad396 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Thu, 14 Mar 2024 15:06:25 -0700 Subject: [PATCH 10/14] Remove varm_layers --- .../cellxgene_census/src/cellxgene_census/_experiment.py | 1 - .../cellxgene_census/src/cellxgene_census/_get_anndata.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py index 308ee8bc6..a05a75c51 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py @@ -45,7 +45,6 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: >>> human = get_experiment(census, "homo_sapiens") """ - # lower/snake case the organism name to find the experiment name exp_name = _get_experiment_name(organism) if exp_name not in census["census_data"]: diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 765fb857d..3fe762968 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -25,7 +25,6 @@ def get_anndata( X_name: str = "raw", X_layers: Optional[Sequence[str]] = (), obsm_layers: Optional[Sequence[str]] = (), - varm_layers: Optional[Sequence[str]] = (), obs_value_filter: Optional[str] = None, obs_coords: Optional[SparseDFCoord] = None, var_value_filter: Optional[str] = None, @@ -64,8 +63,6 @@ def get_anndata( Columns to fetch for ``obs`` and ``var`` dataframes. obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. - varm_layers: - Additional varm layers to read and return in the ``varm`` slot. add_obs_embeddings: Embeddings to be returned as part of the ``obsm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings @@ -102,7 +99,6 @@ def get_anndata( column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, - varm_layers=varm_layers, ) # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot From 299aeb7d0e51dce911ec129b775101b5855dc971 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 15 Mar 2024 09:40:38 -0700 Subject: [PATCH 11/14] General refactor --- .../src/cellxgene_census/_get_anndata.py | 10 +- .../experimental/_embedding.py | 98 +++++++++++-------- 2 files changed, 63 insertions(+), 45 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 3fe762968..b7c7987e3 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -13,6 +13,7 @@ from somacore.options import SparseDFCoord from ._experiment import _get_experiment, _get_experiment_name +from ._release_directory import get_census_version_directory from ._util import _extract_census_version CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" @@ -76,7 +77,7 @@ def get_anndata( An :class:`anndata.AnnData` object containing the census slice. Lifecycle: - maturing + experimental Examples: >>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']") @@ -103,17 +104,18 @@ def get_anndata( # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot if add_obs_embeddings is not None or add_var_embeddings is not None: - from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name + from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name census_version = _extract_census_version(census) experiment_name = _get_experiment_name(organism) + census_directory = get_census_version_directory() if add_obs_embeddings is not None: obs_soma_joinids = query.obs_joinids() for emb in add_obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" - embedding = get_embedding(census_version, uri, obs_soma_joinids) + embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids) adata.obsm[emb] = embedding if add_var_embeddings is not None: @@ -121,7 +123,7 @@ def get_anndata( for emb in add_var_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" - embedding = get_embedding(census_version, uri, var_soma_joinids) + embedding = _get_embedding(census, census_directory, census_version, uri, var_soma_joinids) adata.varm[emb] = embedding return adata diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 392b38c02..3e365ec83 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -17,7 +17,7 @@ import tiledbsoma as soma from .._open import get_default_soma_context, open_soma -from .._release_directory import get_census_version_directory +from .._release_directory import CensusVersionDescription, CensusVersionName, get_census_version_directory CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json" @@ -50,50 +50,15 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC return cast(Dict[str, Any], embedding_metadata) -def get_embedding( +def _get_embedding( + census: soma.Collection, + census_directory: dict[CensusVersionName, CensusVersionDescription], census_version: str, embedding_uri: str, obs_soma_joinids: npt.NDArray[np.int64] | pa.Array, context: soma.options.SOMATileDBContext | None = None, ) -> npt.NDArray[np.float32]: - """Read cell (obs) embeddings and return as a dense :class:`numpy.ndarray`. Any cells without - an embedding will return NaN values. - - Args: - census_version: - The Census version tag, e.g., ``"2023-12-15"``. Used to verify that the contents of - the embedding contain embedded cells from the same Census version. - embedding_uri: - The URI containing the embedding data. - obs_soma_joinids: - The slice of the embedding to fetch and return. - context: - A custom :class:`tiledbsoma.SOMATileDBContext` which will be used to open the SOMA object. - Optional, defaults to ``None``. - - Returns: - A :class:`numpy.ndarray` containing the embeddings. Embeddings are positionally - indexed by the ``obs_soma_joinids``. In other words, the cell identified by - ``obs_soma_joinids[i]`` corresponds to the ``ith`` position in the returned - :class:`numpy.ndarray`. - - Raises: - ValueError: if the Census and embedding are mismatched. - - Lifecycle: - experimental - - Examples: - >>> obs_somaids_to_fetch = np.array([10,11], dtype=np.int64) - >>> emb = cellxgene_census.experimental.get_embedding('2023-12-15', embedding_uri, obs_somaids_to_fetch) - >>> emb.shape - (2, 200) - >>> emb[:, 0:4] - array([[ 0.02954102, 1.0390625 , -0.14550781, -0.40820312], - [-0.00224304, 1.265625 , 0.05883789, -0.7890625 ]], - dtype=float32) - - """ + """Private. Like get_embedding, but accepts a Census object and a Census directory.""" if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): obs_soma_joinids = obs_soma_joinids.to_numpy() assert isinstance(obs_soma_joinids, np.ndarray) @@ -104,7 +69,6 @@ def get_embedding( context = context or get_default_soma_context() # Attempt to resolve census version aliases - census_directory = get_census_version_directory() resolved_census_version = census_directory.get(census_version, None) with soma.open(embedding_uri, context=context) as E: @@ -141,6 +105,58 @@ def get_embedding( return embedding +def get_embedding( + census_version: str, + embedding_uri: str, + obs_soma_joinids: npt.NDArray[np.int64] | pa.Array, + context: soma.options.SOMATileDBContext | None = None, +) -> npt.NDArray[np.float32]: + """Read cell (obs) embeddings and return as a dense :class:`numpy.ndarray`. Any cells without + an embedding will return NaN values. + + Args: + census_version: + The Census version tag, e.g., ``"2023-12-15"``. Used to verify that the contents of + the embedding contain embedded cells from the same Census version. + embedding_uri: + The URI containing the embedding data. + obs_soma_joinids: + The slice of the embedding to fetch and return. + context: + A custom :class:`tiledbsoma.SOMATileDBContext` which will be used to open the SOMA object. + Optional, defaults to ``None``. + + Returns: + A :class:`numpy.ndarray` containing the embeddings. Embeddings are positionally + indexed by the ``obs_soma_joinids``. In other words, the cell identified by + ``obs_soma_joinids[i]`` corresponds to the ``ith`` position in the returned + :class:`numpy.ndarray`. + + Raises: + ValueError: if the Census and embedding are mismatched. + + Lifecycle: + experimental + + Examples: + >>> obs_somaids_to_fetch = np.array([10,11], dtype=np.int64) + >>> emb = cellxgene_census.experimental.get_embedding('2023-12-15', embedding_uri, obs_somaids_to_fetch) + >>> emb.shape + (2, 200) + >>> emb[:, 0:4] + array([[ 0.02954102, 1.0390625 , -0.14550781, -0.40820312], + [-0.00224304, 1.265625 , 0.05883789, -0.7890625 ]], + dtype=float32) + + """ + census_directory = get_census_version_directory() + + with open_soma(census_version=census_version, context=context) as census: + return _get_embedding( + census, census_directory, census_version, embedding_uri, obs_soma_joinids, context=context + ) + + def get_embedding_metadata_by_name( embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding" ) -> dict[str, Any]: From a867985e7eb605620af3aeb3df79718829147cb5 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 15 Mar 2024 10:41:49 -0700 Subject: [PATCH 12/14] Add condition for obsm_layers --- .../src/cellxgene_census/_get_anndata.py | 15 ++++++++++----- .../cellxgene_census/experimental/_embedding.py | 2 +- .../cellxgene_census/tests/test_get_anndata.py | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index b7c7987e3..c6e2a4fad 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -65,11 +65,11 @@ def get_anndata( obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. add_obs_embeddings: - Embeddings to be returned as part of the ``obsm`` slot. + Additional embeddings to be returned as part of the ``obsm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings for this Census version and organism. add_var_embeddings: - Embeddings to be returned as part of the ``varm`` slot. + Additional embeddings to be returned as part of the ``varm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings for this Census version and organism. @@ -103,14 +103,19 @@ def get_anndata( ) # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot - if add_obs_embeddings is not None or add_var_embeddings is not None: + if add_obs_embeddings or add_var_embeddings: from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name census_version = _extract_census_version(census) experiment_name = _get_experiment_name(organism) census_directory = get_census_version_directory() - if add_obs_embeddings is not None: + if add_obs_embeddings: + if obsm_layers and [x for x in add_obs_embeddings if x in obsm_layers]: + raise ValueError( + "Cannot request both `obsm_layers` and `add_obs_embeddings` for the same embedding name" + ) + obs_soma_joinids = query.obs_joinids() for emb in add_obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") @@ -118,7 +123,7 @@ def get_anndata( embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids) adata.obsm[emb] = embedding - if add_var_embeddings is not None: + if add_var_embeddings: var_soma_joinids = query.var_joinids() for emb in add_var_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 3e365ec83..0bfa09a6f 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -183,7 +183,7 @@ def get_embedding_metadata_by_name( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - manifest = cast(dict[str, dict[str, Any]], response.json()) + manifest = cast(Dict[str, Dict[str, Any]], response.json()) embeddings = [] for _, obj in manifest.items(): if ( diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index 7b8ddc54d..c3a8acef8 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -248,3 +248,19 @@ def test_get_anndata_add_var_embeddings(lts_census: soma.Collection, add_var_emb for varm_layers in add_var_embeddings: assert varm_layers in ad.varm.keys() assert ad.varm[varm_layers].shape[0] == 201 + + +@pytest.mark.live_corpus +def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Collection) -> None: + """Fails if both `obsm_layers` and `add_obs_embeddings` are specified.""" + with lts_census: + with pytest.raises(ValueError): + cellxgene_census.get_anndata( + lts_census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + obsm_layers=["scvi"], + add_obs_embeddings=["scvi"], + ) From 4656d89db8aa8d4f63d6b2dc6c28219b534a072a Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Mon, 18 Mar 2024 11:41:46 -0700 Subject: [PATCH 13/14] PR comments --- .../src/cellxgene_census/_get_anndata.py | 43 ++++++++++++------- .../src/cellxgene_census/_util.py | 6 ++- .../experimental/_embedding.py | 15 +++---- .../tests/test_get_anndata.py | 20 ++++----- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index c6e2a4fad..b84ae7269 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -26,13 +26,16 @@ def get_anndata( X_name: str = "raw", X_layers: Optional[Sequence[str]] = (), obsm_layers: Optional[Sequence[str]] = (), + obsp_layers: Optional[Sequence[str]] = (), + varm_layers: Optional[Sequence[str]] = (), + varp_layers: Optional[Sequence[str]] = (), obs_value_filter: Optional[str] = None, obs_coords: Optional[SparseDFCoord] = None, var_value_filter: Optional[str] = None, var_coords: Optional[SparseDFCoord] = None, column_names: Optional[soma.AxisColumnNames] = None, - add_obs_embeddings: Optional[Sequence[str]] = (), - add_var_embeddings: Optional[Sequence[str]] = (), + obs_embeddings: Optional[Sequence[str]] = (), + var_embeddings: Optional[Sequence[str]] = (), ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -64,11 +67,17 @@ def get_anndata( Columns to fetch for ``obs`` and ``var`` dataframes. obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. - add_obs_embeddings: + obsp_layers: + Additional obsp layers to read and return in the ``obsp`` slot. + varm_layers: + Additional varm layers to read and return in the ``varm`` slot. + varp_layers: + Additional varp layers to read and return in the ``varp`` slot. + obs_embeddings: Additional embeddings to be returned as part of the ``obsm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings for this Census version and organism. - add_var_embeddings: + var_embeddings: Additional embeddings to be returned as part of the ``varm`` slot. Use :func:`get_all_available_embeddings` to retrieve available embeddings for this Census version and organism. @@ -90,6 +99,12 @@ def get_anndata( obs_coords = (slice(None),) if obs_coords is None else (obs_coords,) var_coords = (slice(None),) if var_coords is None else (var_coords,) + if obsm_layers and obs_embeddings and set(obsm_layers) & set(obs_embeddings): + raise ValueError("Cannot request both `obsm_layers` and `obs_embeddings` for the same embedding name") + + if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings): + raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name") + with exp.axis_query( measurement_name, obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), @@ -100,32 +115,30 @@ def get_anndata( column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, + varm_layers=varm_layers, + obsp_layers=obsp_layers, + varp_layers=varp_layers, ) - # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot - if add_obs_embeddings or add_var_embeddings: + # If obs_embeddings or var_embeddings are defined, inject them in the appropriate slot + if obs_embeddings or var_embeddings: from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name census_version = _extract_census_version(census) experiment_name = _get_experiment_name(organism) census_directory = get_census_version_directory() - if add_obs_embeddings: - if obsm_layers and [x for x in add_obs_embeddings if x in obsm_layers]: - raise ValueError( - "Cannot request both `obsm_layers` and `add_obs_embeddings` for the same embedding name" - ) - + if obs_embeddings: obs_soma_joinids = query.obs_joinids() - for emb in add_obs_embeddings: + for emb in obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids) adata.obsm[emb] = embedding - if add_var_embeddings: + if var_embeddings: var_soma_joinids = query.var_joinids() - for emb in add_var_embeddings: + for emb in var_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" embedding = _get_embedding(census, census_directory, census_version, uri, var_soma_joinids) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 81992dbf7..b7f70ee2c 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -24,5 +24,9 @@ def _uri_join(base: str, url: str) -> str: def _extract_census_version(census: soma.Collection) -> str: """Extract the Census version from the given Census object.""" - version: str = urllib.parse.urlparse(census.uri).path.split("/")[2] + try: + version: str = urllib.parse.urlparse(census.uri).path.split("/")[2] + except (KeyError, IndexError): + raise ValueError("Unable to extract Census version.") from None + return version diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 0bfa09a6f..05127ec19 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -254,14 +254,13 @@ def get_all_census_versions_with_embedding( response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) response.raise_for_status() - versions = set() manifest = response.json() - for _, obj in manifest.items(): - if ( - obj["embedding_name"] == embedding_name + return sorted( + { + obj["census_version"] + for obj in manifest.values() + if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type - ): - versions.add(obj["census_version"]) - - return sorted(versions) + } + ) diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index c3a8acef8..35063d23e 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -205,8 +205,8 @@ def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: L @pytest.mark.live_corpus -@pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer", "uce"]]) -def test_get_anndata_add_obs_embeddings(lts_census: soma.Collection, add_obs_embeddings: List[str]) -> None: +@pytest.mark.parametrize("obs_embeddings", [["scvi", "geneformer", "uce"]]) +def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: List[str]) -> None: # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, # so this test could require adjustments. @@ -217,19 +217,19 @@ def test_get_anndata_add_obs_embeddings(lts_census: soma.Collection, add_obs_emb X_name="raw", obs_coords=slice(100), var_coords=slice(200), - add_obs_embeddings=add_obs_embeddings, + obs_embeddings=obs_embeddings, ) assert len(ad.obsm.keys()) == 3 assert len(ad.varm.keys()) == 0 - for obsm_layer in add_obs_embeddings: + for obsm_layer in obs_embeddings: assert obsm_layer in ad.obsm.keys() assert ad.obsm[obsm_layer].shape[0] == 101 @pytest.mark.live_corpus -@pytest.mark.parametrize("add_var_embeddings", [["nmf"]]) -def test_get_anndata_add_var_embeddings(lts_census: soma.Collection, add_var_embeddings: List[str]) -> None: +@pytest.mark.parametrize("var_embeddings", [["nmf"]]) +def test_get_anndata_var_embeddings(lts_census: soma.Collection, var_embeddings: List[str]) -> None: # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, # so this test could require adjustments. @@ -240,19 +240,19 @@ def test_get_anndata_add_var_embeddings(lts_census: soma.Collection, add_var_emb X_name="raw", obs_coords=slice(100), var_coords=slice(200), - add_var_embeddings=add_var_embeddings, + var_embeddings=var_embeddings, ) assert len(ad.obsm.keys()) == 0 assert len(ad.varm.keys()) == 1 - for varm_layers in add_var_embeddings: + for varm_layers in var_embeddings: assert varm_layers in ad.varm.keys() assert ad.varm[varm_layers].shape[0] == 201 @pytest.mark.live_corpus def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Collection) -> None: - """Fails if both `obsm_layers` and `add_obs_embeddings` are specified.""" + """Fails if both `obsm_layers` and `obs_embeddings` are specified.""" with lts_census: with pytest.raises(ValueError): cellxgene_census.get_anndata( @@ -262,5 +262,5 @@ def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Co obs_coords=slice(100), var_coords=slice(200), obsm_layers=["scvi"], - add_obs_embeddings=["scvi"], + obs_embeddings=["scvi"], ) From 1cbc2bcbbc451fc4e7f72f8aa283d384d10bc078 Mon Sep 17 00:00:00 2001 From: Emanuele Bezzi Date: Fri, 29 Mar 2024 10:56:48 -0700 Subject: [PATCH 14/14] Switch to urijoin --- .../cellxgene_census/src/cellxgene_census/_get_anndata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index b84ae7269..4f04e7125 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -14,9 +14,9 @@ from ._experiment import _get_experiment, _get_experiment_name from ._release_directory import get_census_version_directory -from ._util import _extract_census_version +from ._util import _extract_census_version, _uri_join -CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma" +CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma/" def get_anndata( @@ -132,7 +132,7 @@ def get_anndata( obs_soma_joinids = query.obs_joinids() for emb in obs_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") - uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" + uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}") embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids) adata.obsm[emb] = embedding @@ -140,7 +140,7 @@ def get_anndata( var_soma_joinids = query.var_joinids() for emb in var_embeddings: emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") - uri = f"{CENSUS_EMBEDDINGS_LOCATION_BASE_URI}/{census_version}/{emb_metadata['id']}" + uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}") embedding = _get_embedding(census, census_directory, census_version, uri, var_soma_joinids) adata.varm[emb] = embedding