Skip to content

Commit

Permalink
Draft
Browse files Browse the repository at this point in the history
  • Loading branch information
ebezzi committed Feb 28, 2024
1 parent abbce9f commit d126590
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def get_anndata(
X_name: str = "raw",
X_layers: Optional[Sequence[str]] = (),
obsm_layers: Optional[Sequence[str]] = (),
varm_layers: Optional[Sequence[str]] = (),
obs_value_filter: Optional[str] = None,
obs_coords: Optional[SparseDFCoord] = None,
var_value_filter: Optional[str] = None,
Expand Down Expand Up @@ -58,6 +59,12 @@ def get_anndata(
Columns to fetch for ``obs`` and ``var`` dataframes.
obsm_layers:
Additional obsm layers to read and return in the ``obsm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
varm_layers:
Additional varm layers to read and return in the ``varm`` slot.
Use :func:`get_all_available_embeddings` to retrieve available embeddings
for this Census version and organism.
Returns:
An :class:`anndata.AnnData` object containing the census slice.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,48 @@ def get_embedding(
np.put(embedding.reshape(-1), indices, emb)

return embedding


def get_all_available_embeddings(census_version: str) -> list[dict[str, Any]]:
    """List the metadata of every embedding available for a Census version.

    Note:
        This function is currently an unimplemented stub: it has no body
        beyond this docstring, so calling it returns ``None``.

    Args:
        census_version:
            The Census version tag, e.g., ``"2023-12-15"``.

    Returns:
        A list of dictionaries, one per available embedding, each containing
        the metadata that describes that embedding.

    Examples:
        >>> get_all_available_embeddings('2023-12-15')
        [{
            'experiment_name': 'experiment_1',
            'measurement_name': 'RNA',
            'organism': 'homo_sapiens',
            'census_version': '2023-12-15',
            'n_embeddings': 1000,
            'n_features': 200,
            'uri': 's3://bucket/embedding_1'
        }]
    """
    # Intentionally empty: the docstring alone is a valid function body.


def get_all_census_versions_with_embedding(
embedding_name: str, organism: str, embedding_type: str | None = "obs_embedding"
) -> list[str]:
"""Get a list of all census versions that contain a specific embedding.
Args:
embedding_name:
The name of the embedding.
organism:
The organism for which the embedding is associated.
embedding_type:
The type of embedding. Defaults to "obs_embedding".
Returns:
A list of census versions that contain the specified embedding.
"""
pass
2 changes: 2 additions & 0 deletions tools/census_contrib/embedding_metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Each embedding will contain a variety of metadata stored in the SOMA `metadata`
| Field name | Required | Type | Description |
| ---------------------- | -------- | ------------- | ------------------------------------------------------------------------------------- |
| id | required | string | CZI-assigned accession ID for this embedding |
| embedding_name | required | string | Name of the algorithm used to generate the embedding |
| title | required | string | Brief project title |
| description | required | string | Succinct description of the method and characteristics of the embeddings and model |
| primary_contact | required | Contact | Primary contact person for these embeddings. |
Expand Down Expand Up @@ -49,6 +50,7 @@ For example:
```json
{
"id": "CxG-contrib-99999",
"embedding_name": "model_algo",
"title": "An embedding",
"description": "Longer description of the embedding and method used to generate it",
"primary_contact": {
Expand Down
9 changes: 9 additions & 0 deletions tools/census_contrib/src/census_contrib/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Contact:
@attrs.define(kw_only=True, frozen=True)
class EmbeddingMetadata:
id: str = field(validator=validators.instance_of(str))
embedding_name: str = field(validator=validators.instance_of(str))
title: str = field(validator=validators.instance_of(str))
description: str = field(validator=validators.instance_of(str))
primary_contact: Contact = field(validator=validators.instance_of(Contact))
Expand Down Expand Up @@ -118,6 +119,7 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding
4. All supplied URLs must resolve
5. Title must have length < 128 characters
6. Description must have length < 2048 characters
7. Name must have length < 24 characters
"""
if not metadata.id:
raise ValueError("metadata is missing 'id' (accession)")
Expand All @@ -140,6 +142,13 @@ def validate_metadata(args: Arguments, metadata: EmbeddingMetadata) -> Embedding
"Metadata: description must be string between 1 and {MAX_DESCRIPTION_LENGTH} characters in length",
)

# 7. Embedding name must be non-empty and at most 24 characters long.
# The metadata field is called `embedding_name`; the original check read
# `metadata.name`, which is not among the fields visible on EmbeddingMetadata.
MAX_NAME_LENGTH = 24
if not metadata.embedding_name or len(metadata.embedding_name) > MAX_NAME_LENGTH:
    raise ValueError(
        f"Metadata: name must be string between 1 and {MAX_NAME_LENGTH} characters in length",
    )

return metadata


Expand Down

0 comments on commit d126590

Please sign in to comment.