diff --git a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py index 4092d374a..a05a75c51 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py @@ -12,6 +12,12 @@ import tiledbsoma as soma +def _get_experiment_name(organism: str) -> str: + """Given an organism name, return the experiment name.""" + # lower/snake case the organism name to find the experiment name + return re.sub(r"[ ]+", "_", organism).lower() + + def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: """Given a census :class:`tiledbsoma.Collection`, return the experiment for the named organism. Organism matching is somewhat flexible, attempting to map from human-friendly @@ -39,8 +45,7 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment: >>> human = get_experiment(census, "homo_sapiens") """ - # lower/snake case the organism name to find the experiment name - exp_name = re.sub(r"[ ]+", "_", organism).lower() + exp_name = _get_experiment_name(organism) if exp_name not in census["census_data"]: raise ValueError(f"Unknown organism {organism} - does not exist") diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py index 54018796f..4f04e7125 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py @@ -12,7 +12,11 @@ import tiledbsoma as soma from somacore.options import SparseDFCoord -from ._experiment import _get_experiment +from ._experiment import _get_experiment, _get_experiment_name +from ._release_directory import get_census_version_directory +from ._util import _extract_census_version, _uri_join + +CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma/" 
def get_anndata( @@ -22,11 +26,16 @@ def get_anndata( X_name: str = "raw", X_layers: Optional[Sequence[str]] = (), obsm_layers: Optional[Sequence[str]] = (), + obsp_layers: Optional[Sequence[str]] = (), + varm_layers: Optional[Sequence[str]] = (), + varp_layers: Optional[Sequence[str]] = (), obs_value_filter: Optional[str] = None, obs_coords: Optional[SparseDFCoord] = None, var_value_filter: Optional[str] = None, var_coords: Optional[SparseDFCoord] = None, column_names: Optional[soma.AxisColumnNames] = None, + obs_embeddings: Optional[Sequence[str]] = (), + var_embeddings: Optional[Sequence[str]] = (), ) -> anndata.AnnData: """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query, and return it as an :class:`anndata.AnnData` object. @@ -58,12 +67,26 @@ def get_anndata( Columns to fetch for ``obs`` and ``var`` dataframes. obsm_layers: Additional obsm layers to read and return in the ``obsm`` slot. + obsp_layers: + Additional obsp layers to read and return in the ``obsp`` slot. + varm_layers: + Additional varm layers to read and return in the ``varm`` slot. + varp_layers: + Additional varp layers to read and return in the ``varp`` slot. + obs_embeddings: + Additional embeddings to be returned as part of the ``obsm`` slot. + Use :func:`get_all_available_embeddings` to retrieve available embeddings + for this Census version and organism. + var_embeddings: + Additional embeddings to be returned as part of the ``varm`` slot. + Use :func:`get_all_available_embeddings` to retrieve available embeddings + for this Census version and organism. Returns: An :class:`anndata.AnnData` object containing the census slice. 
Lifecycle: - maturing + experimental Examples: >>> get_anndata(census, "Mus musculus", obs_value_filter="tissue_general in ['brain', 'lung']") @@ -75,14 +98,50 @@ def get_anndata( exp = _get_experiment(census, organism) obs_coords = (slice(None),) if obs_coords is None else (obs_coords,) var_coords = (slice(None),) if var_coords is None else (var_coords,) + + if obsm_layers and obs_embeddings and set(obsm_layers) & set(obs_embeddings): + raise ValueError("Cannot request both `obsm_layers` and `obs_embeddings` for the same embedding name") + + if varm_layers and var_embeddings and set(varm_layers) & set(var_embeddings): + raise ValueError("Cannot request both `varm_layers` and `var_embeddings` for the same embedding name") + with exp.axis_query( measurement_name, obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords), var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords), ) as query: - return query.to_anndata( + adata = query.to_anndata( X_name=X_name, column_names=column_names, X_layers=X_layers, obsm_layers=obsm_layers, + varm_layers=varm_layers, + obsp_layers=obsp_layers, + varp_layers=varp_layers, ) + + # If obs_embeddings or var_embeddings are defined, inject them in the appropriate slot + if obs_embeddings or var_embeddings: + from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name + + census_version = _extract_census_version(census) + experiment_name = _get_experiment_name(organism) + census_directory = get_census_version_directory() + + if obs_embeddings: + obs_soma_joinids = query.obs_joinids() + for emb in obs_embeddings: + emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "obs_embedding") + uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}") + embedding = _get_embedding(census, census_directory, census_version, uri, obs_soma_joinids) + adata.obsm[emb] = embedding + + if var_embeddings: + var_soma_joinids = 
query.var_joinids() + for emb in var_embeddings: + emb_metadata = get_embedding_metadata_by_name(emb, experiment_name, census_version, "var_embedding") + uri = _uri_join(CENSUS_EMBEDDINGS_LOCATION_BASE_URI, f"{census_version}/{emb_metadata['id']}") + embedding = _get_embedding(census, census_directory, census_version, uri, var_soma_joinids) + adata.varm[emb] = embedding + + return adata diff --git a/api/python/cellxgene_census/src/cellxgene_census/_util.py b/api/python/cellxgene_census/src/cellxgene_census/_util.py index 8b7e5685b..b7f70ee2c 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/_util.py +++ b/api/python/cellxgene_census/src/cellxgene_census/_util.py @@ -1,5 +1,7 @@ import urllib.parse +import tiledbsoma as soma + def _uri_join(base: str, url: str) -> str: """Like urllib.parse.urljoin, but doesn't get confused by s3://.""" @@ -18,3 +20,13 @@ def _uri_join(base: str, url: str) -> str: p_url.fragment, ] return urllib.parse.urlunparse(parts) + + +def _extract_census_version(census: soma.Collection) -> str: + """Extract the Census version from the given Census object.""" + try: + version: str = urllib.parse.urlparse(census.uri).path.split("/")[2] + except (KeyError, IndexError): + raise ValueError("Unable to extract Census version.") from None + + return version diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py index c37c08789..4a65cb3ab 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/__init__.py @@ -1,8 +1,17 @@ """Experimental API for the CELLxGENE Discover Census.""" -from ._embedding import get_embedding, get_embedding_metadata +from ._embedding import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding, + get_embedding_metadata, + get_embedding_metadata_by_name, +) __all__ = [ 
"get_embedding", "get_embedding_metadata", + "get_embedding_metadata_by_name", + "get_all_available_embeddings", + "get_all_census_versions_with_embedding", ] diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py index 489efee24..3926c7106 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/_embedding.py @@ -13,10 +13,13 @@ import numpy.typing as npt import pandas as pd import pyarrow as pa +import requests import tiledbsoma as soma from .._open import get_default_soma_context, open_soma -from .._release_directory import get_census_version_directory +from .._release_directory import CensusVersionDescription, CensusVersionName, get_census_version_directory + +CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json" def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBContext | None = None) -> dict[str, Any]: @@ -47,6 +50,61 @@ def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBC return cast(Dict[str, Any], embedding_metadata) +def _get_embedding( + census: soma.Collection, + census_directory: dict[CensusVersionName, CensusVersionDescription], + census_version: str, + embedding_uri: str, + obs_soma_joinids: npt.NDArray[np.int64] | pa.Array, + context: soma.options.SOMATileDBContext | None = None, +) -> npt.NDArray[np.float32]: + """Private. 
Like get_embedding, but accepts a Census object and a Census directory.""" + if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): + obs_soma_joinids = obs_soma_joinids.to_numpy() + assert isinstance(obs_soma_joinids, np.ndarray) + if obs_soma_joinids.dtype != np.int64: + raise TypeError("obs_soma_joinids must be array of int64") + + # Allow the user to override context for exceptional cases (e.g. the aws region) + context = context or get_default_soma_context() + + # Attempt to resolve census version aliases + resolved_census_version = census_directory.get(census_version, None) + + with soma.open(embedding_uri, context=context) as E: + embedding_metadata = json.loads(E.metadata["CxG_embedding_info"]) + + if resolved_census_version is None: + warnings.warn( + "Unable to determine Census version - skipping validation of Census and embedding version.", + stacklevel=1, + ) + elif resolved_census_version != census_directory.get(embedding_metadata["census_version"], None): + raise ValueError("Census and embedding mismatch - census_version not equal") + + with open_soma(census_version=census_version, context=context) as census: + experiment_name = embedding_metadata["experiment_name"] + if experiment_name not in census["census_data"]: + raise ValueError("Census and embedding mismatch - experiment_name does not exist") + measurement_name = embedding_metadata["measurement_name"] + if measurement_name not in census["census_data"][experiment_name].ms: + raise ValueError("Census and embedding mismatch - measurement_name does not exist") + + embedding_shape = (len(obs_soma_joinids), E.shape[1]) + embedding = np.full(embedding_shape, np.NaN, dtype=np.float32, order="C") + + obs_indexer = soma.IntIndexer(obs_soma_joinids, context=E.context) + for tbl in E.read(coords=(obs_soma_joinids,)).tables(): + obs_idx = obs_indexer.get_indexer(tbl.column("soma_dim_0").to_numpy()) + feat_idx = tbl.column("soma_dim_1").to_numpy() + emb = tbl.column("soma_data") + + indices 
= obs_idx * E.shape[1] + feat_idx + np.put(embedding.reshape(-1), indices, emb) + + return embedding + + def get_embedding( census_version: str, embedding_uri: str, @@ -91,48 +149,118 @@ def get_embedding( dtype=float32) """ - if isinstance(obs_soma_joinids, (pa.Array, pa.ChunkedArray, pd.Series)): - obs_soma_joinids = obs_soma_joinids.to_numpy() - assert isinstance(obs_soma_joinids, np.ndarray) - if obs_soma_joinids.dtype != np.int64: - raise TypeError("obs_soma_joinids must be array of int64") + census_directory = get_census_version_directory() - # Allow the user to override context for exceptional cases (e.g. the aws region) - context = context or get_default_soma_context() + with open_soma(census_version=census_version, context=context) as census: + return _get_embedding( + census, census_directory, census_version, embedding_uri, obs_soma_joinids, context=context + ) - # Attempt to resolve census version aliases - census_directory = get_census_version_directory() - resolved_census_version = census_directory.get(census_version, None) - with soma.open(embedding_uri, context=context) as E: - embedding_metadata = json.loads(E.metadata["CxG_embedding_info"]) +def get_embedding_metadata_by_name( + embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding" +) -> dict[str, Any]: + """Return metadata for a specific embedding. If more than one embedding matches the query parameters, + the most recently submitted one will be returned. - if resolved_census_version is None: - warnings.warn( - "Unable to determine Census version - skipping validation of Census and embedding version.", - stacklevel=1, - ) - elif resolved_census_version != census_directory.get(embedding_metadata["census_version"], None): - raise ValueError("Census and embedding mismatch - census_version not equal") + Args: + embedding_name: + The name of the embedding, e.g. "scvi". + organism: + The experiment name of the organism the embedding is associated with, e.g. "homo_sapiens". 
+ census_version: + The Census version tag, e.g., ``"2023-12-15"``. + embedding_type: + Either "obs_embedding" or "var_embedding". Defaults to "obs_embedding". - with open_soma(census_version=census_version, context=context) as census: - experiment_name = embedding_metadata["experiment_name"] - if experiment_name not in census["census_data"]: - raise ValueError("Census and embedding mismatch - experiment_name does not exist") - measurement_name = embedding_metadata["measurement_name"] - if measurement_name not in census["census_data"][experiment_name].ms: - raise ValueError("Census and embedding mismatch - measurement_name does not exist") + Returns: + A dictionary containing metadata describing the embedding. - embedding_shape = (len(obs_soma_joinids), E.shape[1]) - embedding = np.full(embedding_shape, np.NaN, dtype=np.float32, order="C") + Raises: + ValueError: if no embeddings are found for the specified query parameters. - obs_indexer = soma.IntIndexer(obs_soma_joinids, context=E.context) - for tbl in E.read(coords=(obs_soma_joinids,)).tables(): - obs_idx = obs_indexer.get_indexer(tbl.column("soma_dim_0").to_numpy()) - feat_idx = tbl.column("soma_dim_1").to_numpy() - emb = tbl.column("soma_data") + """ + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() - indices = obs_idx * E.shape[1] + feat_idx - np.put(embedding.reshape(-1), indices, emb) + manifest = cast(Dict[str, Dict[str, Any]], response.json()) + embeddings = [] + for _, obj in manifest.items(): + if ( + obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + and obj["census_version"] == census_version + ): + embeddings.append(obj) - return embedding + if len(embeddings) == 0: + raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}") + + return sorted(embeddings, key=lambda x: x["submission_date"])[-1] + + +def 
get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]: + """Return a list of all available embeddings for a given Census version. + + Args: + census_version: + The Census version tag, e.g., ``"2023-12-15"``. + + Returns: + A list of dictionaries, each containing metadata describing an available embedding. + + Examples: + >>> get_all_available_embeddings('2023-12-15') + [{ + 'experiment_name': 'experiment_1', + 'measurement_name': 'RNA', + 'organism': "homo_sapiens", + 'census_version': '2023-12-15', + 'n_embeddings': 1000, + 'n_features': 200, + 'uri': 's3://bucket/embedding_1' + }] + + """ + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() + + embeddings = [] + manifest = response.json() + for _, obj in manifest.items(): + if obj["census_version"] == census_version: + embeddings.append(obj) + + return embeddings + + +def get_all_census_versions_with_embedding( + embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding" +) -> list[str]: + """Get a list of all census versions that contain a specific embedding. + + Args: + embedding_name: + The name of the embedding, e.g. "scvi". + organism: + The organism for which the embedding is associated. + embedding_type: + The type of embedding. Defaults to "obs_embedding". + + Returns: + A list of census versions that contain the specified embedding. 
+ """ + response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL) + response.raise_for_status() + + manifest = response.json() + return sorted( + { + obj["census_version"] + for obj in manifest.values() + if obj["embedding_name"] == embedding_name + and obj["experiment_name"] == organism + and obj["data_type"] == embedding_type + } + ) diff --git a/api/python/cellxgene_census/tests/experimental/test_embeddings.py b/api/python/cellxgene_census/tests/experimental/test_embeddings.py new file mode 100644 index 000000000..aeb0ff661 --- /dev/null +++ b/api/python/cellxgene_census/tests/experimental/test_embeddings.py @@ -0,0 +1,177 @@ +import pytest +import requests_mock as rm + +from cellxgene_census.experimental import ( + get_all_available_embeddings, + get_all_census_versions_with_embedding, + get_embedding_metadata_by_name, +) +from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL + + +def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-11-15", + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_1", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-12-31", + }, + "embedding-id-3": { + "id": "embedding-id-3", + "embedding_name": "emb_3", + "title": "Embedding 3", + "description": "Third embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + "submission_date": "2023-11-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + 
embedding = get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) + assert embedding is not None + assert embedding["id"] == "embedding-id-2" # most recent version + assert embedding == mock_embeddings["embedding-id-2"] + + embedding = get_embedding_metadata_by_name( + "emb_3", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) + assert embedding is not None + assert embedding["id"] == "embedding-id-3" + assert embedding == mock_embeddings["embedding-id-3"] + + with pytest.raises(ValueError): + get_embedding_metadata_by_name( + "emb_2", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="homo_sapiens", census_version="2023-10-15", embedding_type="obs_embedding" + ) + get_embedding_metadata_by_name( + "emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="var_embedding" + ) + + +def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "measurement_name": "RNA", + "n_embeddings": 1000, + "n_features": 200, + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_2", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "measurement_name": "RNA", + "n_embeddings": 1000, + "n_features": 200, + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, 
json=mock_embeddings) + + embeddings = get_all_available_embeddings("2023-12-15") + assert embeddings is not None + assert len(embeddings) == 2 + + # Query for a non existing version of the Census + embeddings = get_all_available_embeddings("2024-12-15") + assert len(embeddings) == 0 + + +def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> None: + mock_embeddings = { + "embedding-id-1": { + "id": "embedding-id-1", + "embedding_name": "emb_1", + "title": "Embedding 1", + "description": "First embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-2": { + "id": "embedding-id-2", + "embedding_name": "emb_1", + "title": "Embedding 2", + "description": "Second embedding", + "experiment_name": "homo_sapiens", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-3": { + "id": "embedding-id-3", + "embedding_name": "emb_1", + "title": "Embedding 3", + "description": "Third embedding", + "experiment_name": "mus_musculus", + "data_type": "obs_embedding", + "census_version": "2023-12-15", + }, + "embedding-id-4": { + "id": "embedding-id-4", + "embedding_name": "emb_1", + "title": "Embedding 4", + "description": "Fourth embedding", + "experiment_name": "mus_musculus", + "data_type": "obs_embedding", + "census_version": "2024-01-01", + }, + "embedding-id-5": { + "id": "embedding-id-5", + "embedding_name": "emb_2", + "title": "Embedding 5", + "description": "Fifth embedding", + "experiment_name": "mus_musculus", + "data_type": "var_embedding", + "census_version": "2023-12-15", + }, + } + requests_mock.real_http = True + requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings) + + versions = get_all_census_versions_with_embedding("emb_1", organism="homo_sapiens", embedding_type="obs_embedding") + assert versions == ["2023-12-15"] + + versions = get_all_census_versions_with_embedding("emb_1", organism="mus_musculus", 
embedding_type="obs_embedding") + assert versions == ["2023-12-15", "2024-01-01"] + + versions = get_all_census_versions_with_embedding("emb_1", organism="mus_musculus", embedding_type="var_embedding") + assert versions == [] + + versions = get_all_census_versions_with_embedding("emb_2", organism="mus_musculus", embedding_type="var_embedding") + assert versions == ["2023-12-15"] diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py index 9c079cbb7..35063d23e 100644 --- a/api/python/cellxgene_census/tests/test_get_anndata.py +++ b/api/python/cellxgene_census/tests/test_get_anndata.py @@ -12,6 +12,11 @@ def census() -> soma.Collection: return cellxgene_census.open_soma(census_version="latest") +@pytest.fixture +def lts_census() -> soma.Collection: + return cellxgene_census.open_soma(census_version="stable") + + @pytest.mark.live_corpus def test_get_anndata_value_filter(census: soma.Collection) -> None: with census: @@ -158,13 +163,14 @@ def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None: assert raise_info.value.args[0] == "Unknown X layer name" -@pytest.mark.skip(reason="Enable when obsm is available in a live Census distribution.") @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layer", ["scvi", "geneformer"]) -def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> None: - with census: +def test_get_anndata_obsm_one_layer(lts_census: soma.Collection, obsm_layer: str) -> None: + # NOTE: this test will break after next LTS release (>2023-12-15), since scvi and geneformer + # won't be distributed as part of `obsm_layers` anymore. Delete this test when it happens. 
+ with lts_census: ad = cellxgene_census.get_anndata( - census, + lts_census, organism="Homo sapiens", X_name="raw", obs_coords=slice(100), @@ -174,16 +180,17 @@ def test_get_anndata_obsm_one_layer(census: soma.Collection, obsm_layer: str) -> assert len(ad.obsm.keys()) == 1 assert obsm_layer in ad.obsm.keys() - assert ad.obsm[obsm_layer].shape[0] == 100 + assert ad.obsm[obsm_layer].shape[0] == 101 -@pytest.mark.skip(reason="Enable when obsm is available in a live Census distribution.") @pytest.mark.live_corpus @pytest.mark.parametrize("obsm_layers", [["scvi", "geneformer"]]) -def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[str]) -> None: - with census: +def test_get_anndata_obsm_two_layers(lts_census: soma.Collection, obsm_layers: List[str]) -> None: + # NOTE: this test will break after next LTS release (>2023-12-15), since scvi and geneformer + # won't be distributed as part of `obsm_layers` anymore. Delete this test when it happens. + with lts_census: ad = cellxgene_census.get_anndata( - census, + lts_census, organism="Homo sapiens", X_name="raw", obs_coords=slice(100), @@ -194,4 +201,66 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[ assert len(ad.obsm.keys()) == 2 for obsm_layer in obsm_layers: assert obsm_layer in ad.obsm.keys() - assert ad.obsm[obsm_layer].shape[0] == 100 + assert ad.obsm[obsm_layer].shape[0] == 101 + + +@pytest.mark.live_corpus +@pytest.mark.parametrize("obs_embeddings", [["scvi", "geneformer", "uce"]]) +def test_get_anndata_obs_embeddings(lts_census: soma.Collection, obs_embeddings: List[str]) -> None: + # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, + # so this test could require adjustments. 
+ + with lts_census: + ad = cellxgene_census.get_anndata( + lts_census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + obs_embeddings=obs_embeddings, + ) + + assert len(ad.obsm.keys()) == 3 + assert len(ad.varm.keys()) == 0 + for obsm_layer in obs_embeddings: + assert obsm_layer in ad.obsm.keys() + assert ad.obsm[obsm_layer].shape[0] == 101 + + +@pytest.mark.live_corpus +@pytest.mark.parametrize("var_embeddings", [["nmf"]]) +def test_get_anndata_var_embeddings(lts_census: soma.Collection, var_embeddings: List[str]) -> None: + # NOTE: when the next LTS gets released (>2023-12-15), embeddings may or may not be available, + # so this test could require adjustments. + + with lts_census: + ad = cellxgene_census.get_anndata( + lts_census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + var_embeddings=var_embeddings, + ) + + assert len(ad.obsm.keys()) == 0 + assert len(ad.varm.keys()) == 1 + for varm_layers in var_embeddings: + assert varm_layers in ad.varm.keys() + assert ad.varm[varm_layers].shape[0] == 201 + + +@pytest.mark.live_corpus +def test_get_anndata_obsm_layers_and_add_obs_embedding_fails(lts_census: soma.Collection) -> None: + """Fails if both `obsm_layers` and `obs_embeddings` are specified.""" + with lts_census: + with pytest.raises(ValueError): + cellxgene_census.get_anndata( + lts_census, + organism="Homo sapiens", + X_name="raw", + obs_coords=slice(100), + var_coords=slice(200), + obsm_layers=["scvi"], + obs_embeddings=["scvi"], + ) diff --git a/api/python/cellxgene_census/tests/test_util.py b/api/python/cellxgene_census/tests/test_util.py index d7f08902b..3e5a82d68 100644 --- a/api/python/cellxgene_census/tests/test_util.py +++ b/api/python/cellxgene_census/tests/test_util.py @@ -1,4 +1,9 @@ -from cellxgene_census._util import _uri_join +import re + +import pytest + +import cellxgene_census +from cellxgene_census._util import _extract_census_version, _uri_join 
def test_uri_join() -> None: @@ -19,3 +24,20 @@ def test_uri_join() -> None: assert _uri_join("file:///foo/bar", "a") == "file:///foo/a" assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b" + + +@pytest.mark.live_corpus +def test_extract_census_version() -> None: + """Ensures that extracting the Census version from a Collection object does not break""" + + pattern = r"^\d{4}-\d{2}-\d{2}$" + + with cellxgene_census.open_soma(census_version="stable") as census: + assert census is not None + version = _extract_census_version(census) + assert re.match(pattern, version) + + with cellxgene_census.open_soma(census_version="latest") as census: + assert census is not None + version = _extract_census_version(census) + assert re.match(pattern, version) diff --git a/tools/census_contrib/embedding_metadata.md b/tools/census_contrib/embedding_metadata.md index 0a8a7a141..b909e140a 100644 --- a/tools/census_contrib/embedding_metadata.md +++ b/tools/census_contrib/embedding_metadata.md @@ -21,6 +21,7 @@ Each embedding will contain a variety of metadata stored in the SOMA `metadata` | Field name | Required | Type | Description | | ---------------------- | -------- | ------------- | ------------------------------------------------------------------------------------- | | id | required | string | CZI-assigned accession ID for this embedding | +| embedding_name | required | string | Name of the algorithm used to generate the embedding | | title | required | string | Brief project title | | description | required | string | Succinct description of the method and characteristics of the embeddings and model | | primary_contact | required | Contact | Primary contact person for these embeddings. 
| @@ -49,6 +50,7 @@ For example: ```json { "id": "CxG-contrib-99999", + "embedding_name": "model_algo", "title": "An embedding", "description": "Longer description of the embedding and method used to generate it", "primary_contact": { diff --git a/tools/census_contrib/src/census_contrib/metadata.py b/tools/census_contrib/src/census_contrib/metadata.py index f4f283ead..486e49c23 100644 --- a/tools/census_contrib/src/census_contrib/metadata.py +++ b/tools/census_contrib/src/census_contrib/metadata.py @@ -34,6 +34,7 @@ class Contact: @attrs.define(kw_only=True, frozen=True) class EmbeddingMetadata: id: str = field(validator=validators.instance_of(str)) + embedding_name: str = field(validator=validators.instance_of(str)) title: str = field(validator=validators.instance_of(str)) description: str = field(validator=validators.instance_of(str)) primary_contact: Contact = field(validator=validators.instance_of(Contact)) @@ -118,6 +119,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding 4. All supplied URLs must resolve 5. Title must have length < 128 characters 6. Description must have length < 2048 characters + 7. Name must have length < 128 characters """ if not metadata.id: raise ValueError("metadata is missing 'id' (accession)") @@ -140,6 +142,13 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding "Metadata: description must be string between 1 and {MAX_DESCRIPTION_LENGTH} characters in length", ) + # 7. Name must have length < 128 characters + MAX_NAME_LENGTH = 128 + if not metadata.embedding_name or len(metadata.embedding_name) > MAX_NAME_LENGTH: + raise ValueError( + f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length", + ) + return metadata