Skip to content

Commit

Permalink
Checkpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed Mar 1, 2024
1 parent b0472e3 commit ddafdf0
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 24 deletions.
13 changes: 12 additions & 1 deletion api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import tiledbsoma as soma
from somacore.options import SparseDFCoord

# Package-private helper: it is defined in this package's ``_util`` module, so it
# must be imported relatively rather than from a top-level ``util`` module.
from ._util import _extract_census_version

from ._experiment import _get_experiment


Expand Down Expand Up @@ -88,14 +90,23 @@ def get_anndata(
exp = _get_experiment(census, organism)
obs_coords = (slice(None),) if obs_coords is None else (obs_coords,)
var_coords = (slice(None),) if var_coords is None else (var_coords,)

with exp.axis_query(
measurement_name,
obs_query=soma.AxisQuery(value_filter=obs_value_filter, coords=obs_coords),
var_query=soma.AxisQuery(value_filter=var_value_filter, coords=var_coords),
) as query:
return query.to_anndata(
adata = query.to_anndata(
X_name=X_name,
column_names=column_names,
X_layers=X_layers,
obsm_layers=obsm_layers,
varm_layers=varm_layers
)

# If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot
if add_obs_embeddings or add_var_embeddings:
from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name
census_version = _extract_census_version(census)
get_embedding_metadata_by_name()

5 changes: 5 additions & 0 deletions api/python/cellxgene_census/src/cellxgene_census/_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import urllib.parse
import tiledbsoma as soma


def _uri_join(base: str, url: str) -> str:
Expand All @@ -18,3 +19,7 @@ def _uri_join(base: str, url: str) -> str:
p_url.fragment,
]
return urllib.parse.urlunparse(parts)

def _extract_census_version(census: soma.Collection):
    """Return the Census version embedded in the URI of the given Census object.

    The version is read as the third ``/``-separated piece of the URI's path
    component (index 2 after splitting; index 0 is the empty string before the
    leading slash).
    """
    # NOTE(review): presumably Census URIs look like ".../<root>/<version>/..." so
    # that index 2 is the version tag — confirm for non-S3 / local URIs.
    uri_path = urllib.parse.urlparse(census.uri).path
    path_pieces = uri_path.split("/")
    return path_pieces[2]
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Experimental API for the CELLxGENE Discover Census."""

from ._embedding import get_embedding, get_embedding_metadata
from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding

# Public names of this package; each one is imported from ``._embedding`` above.
__all__ = [
"get_embedding",
"get_embedding_metadata",
"get_embedding_metadata_by_name",
"get_all_available_embeddings",
"get_all_census_versions_with_embedding",
]
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
import pandas as pd
import pyarrow as pa
import tiledbsoma as soma
import requests

from .._open import get_default_soma_context, open_soma
from .._release_directory import get_census_version_directory

CELL_CENSUS_EMBEDDINGS_MANIFEST_URL = "https://contrib.cellxgene.cziscience.com/contrib/cell-census/contributions.json"


def get_embedding_metadata(embedding_uri: str, context: soma.options.SOMATileDBContext | None = None) -> dict[str, Any]:
"""Read embedding metadata and return as a Python dict.
Expand Down Expand Up @@ -137,6 +140,40 @@ def get_embedding(

return embedding

def get_embedding_metadata_by_name(
    embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding"
) -> dict[str, Any]:
    """Return metadata for a specific embedding.

    The embeddings manifest is downloaded and filtered on all four query
    parameters. If more than one embedding matches, the one with the greatest
    ``submission_date`` is returned.

    Args:
        embedding_name:
            The name of the embedding, e.g. "scvi".
        organism:
            The organism for which the embedding is associated. Compared against
            the ``experiment_name`` field of each manifest entry.
        census_version:
            The Census version tag, e.g., ``"2023-12-15"``.
        embedding_type:
            Either "obs_embedding" or "var_embedding". Defaults to "obs_embedding".
            Compared against the ``data_type`` field of each manifest entry.

    Returns:
        A dictionary containing metadata describing the embedding.

    Raises:
        ValueError: if no embeddings are found for the specified query parameters.
        requests.HTTPError: if the manifest request returns an error status.
        requests.Timeout: if the manifest server does not answer in time.
    """
    # Always bound the wait: without a timeout an unresponsive server would
    # block the caller indefinitely.
    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, timeout=30)
    response.raise_for_status()

    manifest = response.json()
    embeddings = [
        obj
        for obj in manifest.values()
        if obj["embedding_name"] == embedding_name
        and obj["experiment_name"] == organism
        and obj["data_type"] == embedding_type
        and obj["census_version"] == census_version
    ]

    if not embeddings:
        raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}")

    # NOTE(review): assumes ``submission_date`` is an ISO-8601 string, so that
    # lexicographic order is chronological — confirm against the manifest schema.
    return sorted(embeddings, key=lambda x: x["submission_date"])[-1]

def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
"""Return a dictionary of all available embeddings for a given Census version.
Expand All @@ -151,33 +188,49 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
Examples:
>>> get_all_available_embeddings('2023-12-15')
[{
'experiment_name': 'experiment_1',
'measurement_name': 'RNA',
'organism': "homo_sapiens",
'census_version': '2023-12-15',
'n_embeddings': 1000,
'n_features': 200,
'experiment_name': 'experiment_1',
'measurement_name': 'RNA',
'organism': "homo_sapiens",
'census_version': '2023-12-15',
'n_embeddings': 1000,
'n_features': 200,
'uri': 's3://bucket/embedding_1'
}]
"""
pass


def get_all_census_versions_with_embedding(
embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding"
) -> list[str]:
"""Get a list of all census versions that contain a specific embedding.
response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
response.raise_for_status()

embeddings = []
manifest = response.json()
for _, obj in manifest.items():
if obj["census_version"] == census_version:
embeddings.append(obj)

return embeddings

def get_all_census_versions_with_embedding(
    embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding"
) -> list[str]:
    """Get a list of all census versions that contain a specific embedding.

    Args:
        embedding_name:
            The name of the embedding, e.g. "scvi".
        organism:
            The organism for which the embedding is associated. Compared against
            the ``experiment_name`` field of each manifest entry.
        embedding_type:
            The type of embedding. Defaults to "obs_embedding". Compared against
            the ``data_type`` field of each manifest entry.

    Returns:
        A sorted list of the distinct census versions that contain the specified
        embedding. The list is empty when nothing matches.

    Raises:
        requests.HTTPError: if the manifest request returns an error status.
        requests.Timeout: if the manifest server does not answer in time.
    """
    # Always bound the wait: without a timeout an unresponsive server would
    # block the caller indefinitely.
    response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, timeout=30)
    response.raise_for_status()

    manifest = response.json()
    # A set collapses versions that carry several matching embeddings.
    versions = {
        obj["census_version"]
        for obj in manifest.values()
        if obj["embedding_name"] == embedding_name
        and obj["experiment_name"] == organism
        and obj["data_type"] == embedding_type
    }

    return sorted(versions)
60 changes: 57 additions & 3 deletions api/python/cellxgene_census/tests/experimental/test_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,65 @@
import pytest
import requests_mock as rm

from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding
from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding, get_embedding_metadata_by_name

from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL


def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None:
    """Check embedding lookup by name against a mocked embeddings manifest.

    Covers: picking the most recent submission when several entries match,
    a single-match lookup, and a ``ValueError`` for every non-matching query.
    """
    mock_embeddings = {
        "embedding-id-1": {
            "id": "embedding-id-1",
            "embedding_name": "emb_1",
            "title": "Embedding 1",
            "description": "First embedding",
            "experiment_name": "homo_sapiens",
            "data_type": "obs_embedding",
            "census_version": "2023-12-15",
            "submission_date": "2023-11-15",
        },
        "embedding-id-2": {
            "id": "embedding-id-2",
            "embedding_name": "emb_1",
            "title": "Embedding 2",
            "description": "Second embedding",
            "experiment_name": "homo_sapiens",
            "data_type": "obs_embedding",
            "census_version": "2023-12-15",
            "submission_date": "2023-12-31",
        },
        "embedding-id-3": {
            "id": "embedding-id-3",
            "embedding_name": "emb_3",
            "title": "Embedding 3",
            "description": "Third embedding",
            "experiment_name": "homo_sapiens",
            "data_type": "obs_embedding",
            "census_version": "2023-12-15",
            "submission_date": "2023-11-15",
        },
    }
    requests_mock.real_http = True
    requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings)

    embedding = get_embedding_metadata_by_name(
        "emb_1", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding"
    )
    assert embedding is not None
    assert embedding["id"] == "embedding-id-2"  # most recent version
    assert embedding == mock_embeddings["embedding-id-2"]

    embedding = get_embedding_metadata_by_name(
        "emb_3", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding"
    )
    assert embedding is not None
    assert embedding["id"] == "embedding-id-3"
    assert embedding == mock_embeddings["embedding-id-3"]

    # (embedding_name, organism, census_version, embedding_type) queries that
    # match no manifest entry.
    no_match_queries = [
        ("emb_2", "homo_sapiens", "2023-12-15", "obs_embedding"),
        ("emb_1", "mus_musculus", "2023-12-15", "obs_embedding"),
        ("emb_1", "homo_sapiens", "2023-10-15", "obs_embedding"),
        ("emb_1", "mus_musculus", "2023-12-15", "var_embedding"),
        # Differs from a matching query in embedding_type only.
        ("emb_1", "homo_sapiens", "2023-12-15", "var_embedding"),
    ]
    # Each query gets its own ``pytest.raises`` block: the context manager exits
    # at the first exception, so further calls inside the same block would
    # never execute and would go unchecked.
    for name, organism, version, emb_type in no_match_queries:
        with pytest.raises(ValueError):
            get_embedding_metadata_by_name(
                name, organism=organism, census_version=version, embedding_type=emb_type
            )



@pytest.mark.live_corpus
def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None:
mock_embeddings = {
"embedding-id-1": {
Expand Down Expand Up @@ -44,7 +99,6 @@ def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None:
assert len(embeddings) == 0


@pytest.mark.live_corpus
def test_get_all_census_versions_with_embedding(requests_mock: rm.Mocker) -> None:
mock_embeddings = {
"embedding-id-1": {
Expand Down
21 changes: 20 additions & 1 deletion api/python/cellxgene_census/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from cellxgene_census._util import _uri_join
from cellxgene_census._util import _uri_join, _extract_census_version
import cellxgene_census
import pytest
import re


def test_uri_join() -> None:
Expand All @@ -19,3 +22,19 @@ def test_uri_join() -> None:
assert _uri_join("file:///foo/bar", "a") == "file:///foo/a"

assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b"

@pytest.mark.live_corpus
def test_extract_census_version() -> None:
    """Check that a date-shaped version can be extracted from an opened Census.

    Opens the "stable" and then the "latest" Census and verifies that the
    extracted version matches ``YYYY-MM-DD``.
    """
    date_shaped = re.compile(r'^\d{4}-\d{2}-\d{2}$')

    for version_tag in ("stable", "latest"):
        with cellxgene_census.open_soma(census_version=version_tag) as census:
            assert census is not None
            extracted = _extract_census_version(census)
            assert date_shaped.match(extracted)

0 comments on commit ddafdf0

Please sign in to comment.