diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 6ce220807..d8a2d7515 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,6 +12,8 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord +from util import _extract_census_version + from ._experiment import _get_experiment @@ -88,14 +90,23 @@ def get_anndata( exp = _get_experiment(census, organism) obs_coords = (slice(None),) if obs_coords is None else (obs_coords,) var_coords = (slice(None),) if var_coords is None else (var_coords,) + with exp.axis_query( measurement_name, obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords), ) as query: - return query.to_anndata( + adata = query.to_anndata( X_name=X_name, column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, + varm_layers=varm_layers ) + + # If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot + if add_obs_embeddings or add_var_embeddings: + from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name + census_version = _extract_census_version(census) + get_embedding_metadata_by_name() + diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 8b7e5685b..93b8c6caa 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -1,4 +1,5 @@ import urllib.parse +import tiledbsoma as soma def _uri_join(base: str, url: str) -> str: @@ -18,3 +19,7 @@ def _uri_join(base: str, url: str) -> str: p_url.fragment, ] return urllib.parse.urlunparse(parts) + +def _extract_census_version(census: soma.Collection): + """Extract the 
Census version from the given Census object.""" + return urllib.parse.urlparse(census.uri).path.split("/")[2] \ No newline at end of file diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index c37c08789..e09759bc2 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,8 +1,11 @@ """Experimental API for the CELLxGENE Discover Census.""" -from ._embedding import get_embedding, get_embedding_metadata +from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding __all__ = [ "get_embedding", "get_embedding_metadata", + "get_embedding_metadata_by_name", + "get_all_available_embeddings", + "get_all_census_versions_with_embedding", ] diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 4ed53bfe4..6c29b32f7 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -14,10 +14,13 @@ import pandas as pd import pyarrow as pa import tiledbsoma as soma +import requests from .._open import get_default_soma_context, open_soma from .._release_directory import get_census_version_directory +CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json" + def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBContext | None = None) -> dict[str, Any]: """Read embedding metadata and return as a Python dict. 
def _fetch_embeddings_manifest(timeout: float = 30.0) -> dict[str, Any]:
    """Download and decode the embeddings manifest.

    Args:
        timeout:
            Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON document. Callers treat it as a mapping whose values are
        per-embedding metadata dictionaries.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Without an explicit timeout, ``requests`` may wait forever on an unresponsive server.
    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, timeout=timeout)
    response.raise_for_status()
    manifest: dict[str, Any] = response.json()
    return manifest


def _select_embeddings(manifest: dict[str, Any], **criteria: Any) -> list[dict[str, Any]]:
    """Return the manifest entries whose fields equal every given ``field=value`` criterion."""
    # An entry that lacks one of the fields simply does not match, rather than raising KeyError.
    return [
        entry
        for entry in manifest.values()
        if all(field in entry and entry[field] == wanted for field, wanted in criteria.items())
    ]


def get_embedding_metadata_by_name(
    embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding"
) -> dict[str, Any]:
    """Return metadata for a specific embedding.

    If more than one embedding matches the query parameters, the one with the greatest
    ``submission_date`` is returned.

    Args:
        embedding_name:
            The name of the embedding, e.g. "scvi".
        organism:
            The organism for which the embedding is associated.
        census_version:
            The Census version tag, e.g., ``"2023-12-15"``.
        embedding_type:
            Either "obs_embedding" or "var_embedding". Defaults to "obs_embedding".

    Returns:
        A dictionary containing metadata describing the embedding.

    Raises:
        ValueError: if no embeddings are found for the specified query parameters.
    """
    candidates = _select_embeddings(
        _fetch_embeddings_manifest(),
        embedding_name=embedding_name,
        experiment_name=organism,
        data_type=embedding_type,
        census_version=census_version,
    )

    if not candidates:
        raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}")

    # sorted() is stable: among equal submission dates the entry listed last in the manifest wins.
    return sorted(candidates, key=lambda entry: entry["submission_date"])[-1]


def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
    """Return a list of all available embeddings for a given Census version.

    Args:
        census_version:
            The Census version tag, e.g., ``"2023-12-15"``.

    Returns:
        A list of dictionaries, each containing metadata describing one embedding. The list is
        empty when the manifest has no entry for ``census_version``.

    Examples:
        >>> get_all_available_embeddings('2023-12-15')
        [{
            'experiment_name': 'experiment_1',
            'measurement_name': 'RNA',
            'organism': "homo_sapiens",
            'census_version': '2023-12-15',
            'n_embeddings': 1000,
            'n_features': 200,
            'uri': 's3://bucket/embedding_1'
        }]
    """
    return _select_embeddings(_fetch_embeddings_manifest(), census_version=census_version)


def get_all_census_versions_with_embedding(
    embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding"
) -> list[str]:
    """Get a list of all census versions that contain a specific embedding.

    Args:
        embedding_name:
            The name of the embedding, e.g. "scvi".
        organism:
            The organism for which the embedding is associated.
        embedding_type:
            The type of embedding. Defaults to "obs_embedding".

    Returns:
        A sorted list of the distinct census versions that contain the specified embedding.
    """
    matches = _select_embeddings(
        _fetch_embeddings_manifest(),
        embedding_name=embedding_name,
        experiment_name=organism,
        data_type=embedding_type,
    )
    # A set comprehension drops versions that appear under several manifest entries.
    return sorted({entry["census_version"] for entry in matches})
"obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-11-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + embedding = get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + assert embedding is not None + assert embedding["id"] == "embedding-id-2" # most recent version + assert embedding == mock_embeddings["embedding-id-2"] + + embedding = get_embedding_metadata_by_name("emb_3", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + assert embedding is not None + assert embedding["id"] == "embedding-id-3" + assert embedding == mock_embeddings["embedding-id-3"] + + with pytest.raises(ValueError): + get_embedding_metadata_by_name("emb_2", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-10-15", embedding_type = "obs_embedding") + get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "var_embedding") + + -@pytest.mark.live_corpus def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: mock_embeddings = { "embedding-id-1": { @@ -44,7 +99,6 @@ def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: assert len(embeddings) == 0 -@pytest.mark.live_corpus def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> None: mock_embeddings = { "embedding-id-1": { diff --git a/api/python/cellxgene_census/tests/test_util.py b/api/python/cellxgene_census/tests/test_util.py index d7f08902b..a9e6162df 100644 --- a/api/python/cellxgene_census/tests/test_util.py +++ 
@pytest.mark.live_corpus
def test_extract_census_version() -> None:
    """Ensure a ``YYYY-MM-DD`` version tag can be extracted from live Census handles."""
    version_tag = re.compile(r"\d{4}-\d{2}-\d{2}")

    # Both aliases are exercised with the same checks.
    for alias in ("stable", "latest"):
        with cellxgene_census.open_soma(census_version=alias) as census:
            assert census is not None
            version = _extract_census_version(census)
            # fullmatch is stricter than match with ``$``, which also accepts a trailing newline.
            assert version_tag.fullmatch(version), f"unexpected version {version!r} for alias {alias!r}"