Skip to content

Commit

Permalink
lint part 1
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed Mar 8, 2024
1 parent 337b571 commit 77d28a5
Show file tree
Hide file tree
Showing 8 changed files with 87 additions and 48 deletions.
10 changes: 4 additions & 6 deletions api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
import tiledbsoma as soma
from somacore.options import SparseDFCoord

from ._util import _extract_census_version

from ._experiment import _get_experiment
from ._util import _extract_census_version

CENSUS_EMBEDDINGS_LOCATION_BASE_URI = "s3://cellxgene-contrib-public/contrib/cell-census/soma"

Expand Down Expand Up @@ -103,13 +102,14 @@ def get_anndata(
column_names=column_names,
X_layers=X_layers,
obsm_layers=obsm_layers,
varm_layers=varm_layers
varm_layers=varm_layers,
)

# If add_obs_embeddings or add_var_embeddings are defined, inject them in the appropriate slot
if add_obs_embeddings or add_var_embeddings:
if add_obs_embeddings is not None:
obs_soma_joinids = query.obs_joinids()
from cellxgene_census.experimental import get_embedding, get_embedding_metadata_by_name

census_version = _extract_census_version(census)
for emb in add_obs_embeddings:
emb_metadata = get_embedding_metadata_by_name(emb, organism, census_version, "obs_embedding")
Expand All @@ -118,5 +118,3 @@ def get_anndata(
adata.obsm[emb] = embedding

return adata


6 changes: 4 additions & 2 deletions api/python/cellxgene_census/src/cellxgene_census/_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import urllib.parse

import tiledbsoma as soma


Expand All @@ -20,6 +21,7 @@ def _uri_join(base: str, url: str) -> str:
]
return urllib.parse.urlunparse(parts)

def _extract_census_version(census: soma.Collection):

def _extract_census_version(census: soma.Collection) -> str:
"""Extract the Census version from the given Census object."""
return urllib.parse.urlparse(census.uri).path.split("/")[2]
return urllib.parse.urlparse(census.uri).path.split("/")[2]
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
"""Experimental API for the CELLxGENE Discover Census."""

from ._embedding import get_embedding, get_embedding_metadata, get_embedding_metadata_by_name, get_all_available_embeddings, get_all_census_versions_with_embedding
from ._embedding import (
get_all_available_embeddings,
get_all_census_versions_with_embedding,
get_embedding,
get_embedding_metadata,
get_embedding_metadata_by_name,
)

__all__ = [
"get_embedding",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
import numpy.typing as npt
import pandas as pd
import pyarrow as pa
import tiledbsoma as soma
import requests
import tiledbsoma as soma

from .._open import get_default_soma_context, open_soma
from .._release_directory import get_census_version_directory
Expand Down Expand Up @@ -140,8 +140,11 @@ def get_embedding(

return embedding

def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding") -> dict[str, Any]:
"""Return metadata for a specific embedding. If more embeddings match the query parameters,

def get_embedding_metadata_by_name(
embedding_name: str, organism: str, census_version: str, embedding_type: str | None = "obs_embedding"
) -> dict[str, Any]:
"""Return metadata for a specific embedding. If more embeddings match the query parameters,
the most recent one will be returned.
Args:
Expand All @@ -167,14 +170,20 @@ def get_embedding_metadata_by_name(embedding_name: str, organism: str, census_ve
manifest = response.json()
embeddings = []
for _, obj in manifest.items():
if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type and obj["census_version"] == census_version:
if (
obj["embedding_name"] == embedding_name
and obj["experiment_name"] == organism
and obj["data_type"] == embedding_type
and obj["census_version"] == census_version
):
embeddings.append(obj)

if len(embeddings) == 0:
raise ValueError(f"No embeddings found for {embedding_name}, {organism}, {census_version}, {embedding_type}")

return sorted(embeddings, key=lambda x: x["submission_date"])[-1]


def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
"""Return a dictionary of all available embeddings for a given Census version.
Expand All @@ -188,19 +197,19 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
Examples:
>>> get_all_available_embeddings('2023-12-15')
[{
'experiment_name': 'experiment_1',
'measurement_name': 'RNA',
'organism': "homo_sapiens",
'census_version': '2023-12-15',
'n_embeddings': 1000,
'n_features': 200,
'experiment_name': 'experiment_1',
'measurement_name': 'RNA',
'organism': "homo_sapiens",
'census_version': '2023-12-15',
'n_embeddings': 1000,
'n_features': 200,
'uri': 's3://bucket/embedding_1'
}]
"""
response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
response.raise_for_status()

embeddings = []
manifest = response.json()
for _, obj in manifest.items():
Expand All @@ -209,28 +218,34 @@ def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:

return embeddings

def get_all_census_versions_with_embedding(embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding") -> list[str]:
"""
Get a list of all census versions that contain a specific embedding.

def get_all_census_versions_with_embedding(
embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding"
) -> list[str]:
"""Get a list of all census versions that contain a specific embedding.
Args:
embedding_name:
embedding_name:
The name of the embedding, e.g. "scvi".
organism:
organism:
The organism for which the embedding is associated.
embedding_type:
embedding_type:
The type of embedding. Defaults to "obs_embedding".
Returns:
A list of census versions that contain the specified embedding.
"""
response = requests.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL)
response.raise_for_status()

versions = set()
manifest = response.json()
for _, obj in manifest.items():
if obj["embedding_name"] == embedding_name and obj["experiment_name"] == organism and obj["data_type"] == embedding_type:
if (
obj["embedding_name"] == embedding_name
and obj["experiment_name"] == organism
and obj["data_type"] == embedding_type
):
versions.add(obj["census_version"])

return sorted(list(versions))
return sorted(versions)
38 changes: 26 additions & 12 deletions api/python/cellxgene_census/tests/experimental/test_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import pytest
import requests_mock as rm

from cellxgene_census.experimental import get_all_available_embeddings, get_all_census_versions_with_embedding, get_embedding_metadata_by_name

from cellxgene_census.experimental import (
get_all_available_embeddings,
get_all_census_versions_with_embedding,
get_embedding_metadata_by_name,
)
from cellxgene_census.experimental._embedding import CELL_CENSUS_EMBEDDINGS_MANIFEST_URL


Expand All @@ -16,7 +19,7 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None:
"experiment_name": "homo_sapiens",
"data_type": "obs_embedding",
"census_version": "2023-12-15",
"submission_date": "2023-11-15"
"submission_date": "2023-11-15",
},
"embedding-id-2": {
"id": "embedding-id-2",
Expand All @@ -42,23 +45,34 @@ def test_get_embedding_metadata_by_name(requests_mock: rm.Mocker) -> None:
requests_mock.real_http = True
requests_mock.get(CELL_CENSUS_EMBEDDINGS_MANIFEST_URL, json=mock_embeddings)

embedding = get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding")
embedding = get_embedding_metadata_by_name(
"emb_1", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding"
)
assert embedding is not None
assert embedding["id"] == "embedding-id-2" # most recent version
assert embedding["id"] == "embedding-id-2" # most recent version
assert embedding == mock_embeddings["embedding-id-2"]

embedding = get_embedding_metadata_by_name("emb_3", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding")
embedding = get_embedding_metadata_by_name(
"emb_3", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding"
)
assert embedding is not None
assert embedding["id"] == "embedding-id-3"
assert embedding["id"] == "embedding-id-3"
assert embedding == mock_embeddings["embedding-id-3"]

with pytest.raises(ValueError):
get_embedding_metadata_by_name("emb_2", organism = "homo_sapiens", census_version = "2023-12-15", embedding_type = "obs_embedding")
get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "obs_embedding")
get_embedding_metadata_by_name("emb_1", organism = "homo_sapiens", census_version = "2023-10-15", embedding_type = "obs_embedding")
get_embedding_metadata_by_name("emb_1", organism = "mus_musculus", census_version = "2023-12-15", embedding_type = "var_embedding")
get_embedding_metadata_by_name(
"emb_2", organism="homo_sapiens", census_version="2023-12-15", embedding_type="obs_embedding"
)
get_embedding_metadata_by_name(
"emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="obs_embedding"
)
get_embedding_metadata_by_name(
"emb_1", organism="homo_sapiens", census_version="2023-10-15", embedding_type="obs_embedding"
)
get_embedding_metadata_by_name(
"emb_1", organism="mus_musculus", census_version="2023-12-15", embedding_type="var_embedding"
)



def test_get_all_available_embeddings(requests_mock: rm.Mocker) -> None:
mock_embeddings = {
Expand Down
1 change: 1 addition & 0 deletions api/python/cellxgene_census/tests/test_get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def test_get_anndata_obsm_two_layers(census: soma.Collection, obsm_layers: List[
assert obsm_layer in ad.obsm.keys()
assert ad.obsm[obsm_layer].shape[0] == 100


@pytest.mark.live_corpus
@pytest.mark.parametrize("add_obs_embeddings", [["scvi", "geneformer"]])
def test_get_anndata_add_obs_embeddings(census: soma.Collection, add_obs_embeddings: List[str]) -> None:
Expand Down
13 changes: 8 additions & 5 deletions api/python/cellxgene_census/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from cellxgene_census._util import _uri_join, _extract_census_version
import cellxgene_census
import pytest
import re

import pytest

import cellxgene_census
from cellxgene_census._util import _extract_census_version, _uri_join


def test_uri_join() -> None:
assert _uri_join("https://foo/", "bar") == "https://foo/bar"
Expand All @@ -23,11 +25,12 @@ def test_uri_join() -> None:

assert _uri_join("https://foo/bar", "https://a/b") == "https://a/b"


@pytest.mark.live_corpus
def test_extract_census_version() -> None:
"""Ensures that extracting the Census version from a Collection object does not break"""

pattern = r'^\d{4}-\d{2}-\d{2}$'
pattern = r"^\d{4}-\d{2}-\d{2}$"

with cellxgene_census.open_soma(census_version="stable") as census:
assert census is not None
Expand All @@ -37,4 +40,4 @@ def test_extract_census_version() -> None:
with cellxgene_census.open_soma(census_version="latest") as census:
assert census is not None
version = _extract_census_version(census)
assert re.match(pattern, version)
assert re.match(pattern, version)
2 changes: 1 addition & 1 deletion tools/census_contrib/src/census_contrib/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding

# 7. Name must have length < 128 characters
MAX_NAME_LENGTH = 128
if not metadata.name or len(metadata.name) > MAX_NAME_LENGTH:
if not metadata.embedding_name or len(metadata.embedding_name) > MAX_NAME_LENGTH:
raise ValueError(
f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length",
)
Expand Down

0 comments on commit 77d28a5

Please sign in to comment.