Skip to content

Commit

Permalink
add "organisms" table to census_info
Browse files Browse the repository at this point in the history
  • Loading branch information
bkmartinjr committed Mar 22, 2024
1 parent 67003f5 commit 93a7246
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 4 deletions.
55 changes: 55 additions & 0 deletions docs/cellxgene_census_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,61 @@ Example of this `SOMADataFrame`:
</tbody>
</table>

#### Census table of organisms – `census_obj["census_info"]["organisms"]` – `SOMADataFrame`

Information about organisms whose cells are included in the Census MUST be included in a table modeled as a `SOMADataFrame`. Each row MUST correspond to an individual organism with the following columns:

<table>
<thead>
<tr>
<th>Column</th>
<th>Encoding</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>organism_ontology_term_id</td>
<td>string</td>
<td>As defined in the CELLxGENE dataset schema.</td>
</tr>
<tr>
<td>organism_label</td>
<td>string</td>
<td>Human-readable label as given by the ontology.</td>
</tr>
<tr>
<td>organism</td>
<td>string</td>
<td>Machine-friendly label used to name the SOMA Experiments; see the <a href="#census-data--census_objcensus_dataorganism--somaexperiment">Census Data section</a> below.</td>
</tr>
</tbody>
</table>

An example of this `SOMADataFrame` is shown below:

<table>
<thead>
<tr>
<th>organism_ontology_term_id</th>
<th>organism_label</th>
<th>organism</th>
</tr>
</thead>
<tbody>
<tr>
<td>NCBITaxon:9606</td>
<td>Homo sapiens</td>
<td>homo_sapiens</td>
</tr>
<tr>
<td>NCBITaxon:10090</td>
<td>Mus musculus</td>
<td>mus_musculus</td>
</tr>
</tbody>
</table>

### Census Data – `census_obj["census_data"][organism]` – `SOMAExperiment`

Data for *Homo sapiens* MUST be stored as a `SOMAExperiment` in `census_obj["census_data"]["homo_sapiens"]`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from ..build_state import CensusBuildArgs
from ..util import clamp, cpu_count
from .census_summary import create_census_summary
from .census_summary import create_census_info_organisms, create_census_summary
from .consolidate import submit_consolidate
from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest
from .experiment_builder import (
Expand Down Expand Up @@ -297,6 +297,7 @@ def build_step5_save_axis_and_summary_info(
create_dataset_manifest(census_info, filtered_datasets)
create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders])
create_census_summary(census_info, experiment_builders, build_tag)
create_census_info_organisms(census_info, experiment_builders)

logger.info("Build step 5 - Save axis and summary info - finished")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import tiledbsoma as soma

from .experiment_builder import ExperimentBuilder, get_summary_stats
from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, CXG_SCHEMA_VERSION
from .globals import CENSUS_INFO_ORGANISMS_NAME, CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, CXG_SCHEMA_VERSION

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -35,3 +35,27 @@ def create_census_summary(
CENSUS_SUMMARY_NAME, schema=pa.Schema.from_pandas(df, preserve_index=False), index_column_names=["soma_joinid"]
) as summary:
summary.write(pa.Table.from_pandas(df, preserve_index=False))


def create_census_info_organisms(
    info_collection: soma.Collection, experiment_builders: Sequence[ExperimentBuilder]
) -> None:
    """Write the organisms dataframe into ``info_collection``.

    One row is produced per experiment builder, in the order given, with the
    columns ``organism_ontology_term_id``, ``organism_label`` and ``organism``
    taken from the builder's specification. A ``soma_joinid`` column numbered
    from 0 is appended and used as the index column of the new dataframe,
    which is added to ``info_collection`` under ``CENSUS_INFO_ORGANISMS_NAME``.
    """
    logger.info("Create census organisms dataframe")

    # Collect one record per experiment, preserving the builders' order.
    specifications = [builder.specification for builder in experiment_builders]
    records = [
        {
            "organism_ontology_term_id": spec.organism_ontology_term_id,
            "organism_label": spec.label,
            "organism": spec.name,
        }
        for spec in specifications
    ]
    organisms_df = pd.DataFrame.from_records(records)

    # Sequential join ids, added last so the column order stays unchanged.
    organisms_df["soma_joinid"] = range(len(organisms_df))

    organisms_schema = pa.Schema.from_pandas(organisms_df, preserve_index=False)
    with info_collection.add_new_dataframe(
        CENSUS_INFO_ORGANISMS_NAME,
        schema=organisms_schema,
        index_column_names=["soma_joinid"],
    ) as organisms:
        organisms.write(pa.Table.from_pandas(organisms_df, preserve_index=False))
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,30 @@ class ExperimentSpecification:
specification, independent of the datasets used to build the census.
Parameters:
* experiment "name" (eg, 'human'), must be unique in all experiments.
* experiment "name" (eg, 'homo_sapiens'), must be unique in all experiments.
* a human-readable label, e.g, "Homo sapiens"
* ontology ID
* an AnnData filter used to cherry pick data for the experiment
* external reference data used to build the experiment, e.g., gene length data
Usage: to create, use the factory method `ExperimentSpecification.create(...)`
"""

name: str
label: str
anndata_cell_filter_spec: AnnDataFilterSpec
organism_ontology_term_id: str

@classmethod
def create(
cls,
name: str,
label: str,
anndata_cell_filter_spec: AnnDataFilterSpec,
organism_ontology_term_id: str,
) -> Self:
"""Factory method. Do not instantiate the class directly."""
return cls(name, anndata_cell_filter_spec, organism_ontology_term_id)
return cls(name, label, anndata_cell_filter_spec, organism_ontology_term_id)


class ExperimentBuilder:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
return [ # The soma.Experiments we want to build
ExperimentSpecification.create(
name="homo_sapiens",
label="Homo sapiens",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:9606",
"assay_ontology_term_ids": RNA_SEQ,
Expand All @@ -22,6 +23,7 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
),
ExperimentSpecification.create(
name="mus_musculus",
label="Mus musculus",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:10090",
"assay_ontology_term_ids": RNA_SEQ,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
# "census_info"/"summary" SOMA Dataframe
CENSUS_SUMMARY_NAME = "summary"

# "census_info"/"organisms" SOMA Dataframe
CENSUS_INFO_ORGANISMS_NAME = "organisms"

# "census_data"/{organism}/ms/"RNA" SOMA Matrix
MEASUREMENT_RNA_NAME = "RNA"

Expand Down

0 comments on commit 93a7246

Please sign in to comment.