add "organisms" table to census_info

chanzuckerberg · Mar 22, 2024 · 93a7246 · 93a7246
1 parent 67003f5
commit 93a7246
Show file tree

Hide file tree

Showing 6 changed files with 93 additions and 4 deletions.
diff --git a/docs/cellxgene_census_schema.md b/docs/cellxgene_census_schema.md
@@ -583,6 +583,61 @@ Example of this `SOMADataFrame`:
 </tbody>
 </table>
 
+#### Census table of organisms – `census_obj["census_info"]["organisms"]` – `SOMADataFrame`
+
+Information about organisms whose cells are included in the Census MUST be included in a table modeled as a `SOMADataFrame`. Each row MUST correspond to an individual organism with the following columns:
+
+<table>
+<thead>
+  <tr>
+    <th>Column</th>
+    <th>Encoding</th>
+    <th>Description</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>organism_ontology_term_id</td>
+    <td>string</td>
+    <td>As defined in the CELLxGENE dataset schema.</td>
+  </tr>
+  <tr>
+    <td>organism_label</td>
+    <td>string</td>
+    <td>Human-readable label as given by the ontology.</td>
+  </tr>
+  <tr>
+    <td>organism</td>
+    <td>string</td>
+    <td>Machine-friendly label used to name the SOMA Experiments; see the <a href="#census-data--census_objcensus_dataorganism--somaexperiment">Census Data section</a> below.</td>
+  </tr>
+</tbody>
+</table>
+
+An example of this `SOMADataFrame` is shown below:
+
+<table>
+<thead>
+  <tr>
+    <th>organism_ontology_term_id</th>
+    <th>organism_label</th>
+    <th>organism</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td>NCBITaxon:9606</td>
+    <td>Homo sapiens</td>
+    <td>homo_sapiens</td>
+  </tr>
+  <tr>
+    <td>NCBITaxon:10090</td>
+    <td>Mus musculus</td>
+    <td>mus_musculus</td>
+  </tr>
+</tbody>
+</table>
+
 ### Census Data – `census_obj["census_data"][organism]` – `SOMAExperiment`
 
 Data for *Homo sapiens* MUST be stored as a `SOMAExperiment` in `census_obj["homo_sapiens"]`.

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py
@@ -11,7 +11,7 @@
 
 from ..build_state import CensusBuildArgs
 from ..util import clamp, cpu_count
-from .census_summary import create_census_summary
+from .census_summary import create_census_info_organisms, create_census_summary
 from .consolidate import submit_consolidate
 from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest
 from .experiment_builder import (
@@ -297,6 +297,7 @@ def build_step5_save_axis_and_summary_info(
         create_dataset_manifest(census_info, filtered_datasets)
         create_census_summary_cell_counts(census_info, [e.census_summary_cell_counts for e in experiment_builders])
         create_census_summary(census_info, experiment_builders, build_tag)
+        create_census_info_organisms(census_info, experiment_builders)
 
     logger.info("Build step 5 - Save axis and summary info - finished")
 

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/census_summary.py
@@ -6,7 +6,7 @@
 import tiledbsoma as soma
 
 from .experiment_builder import ExperimentBuilder, get_summary_stats
-from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, CXG_SCHEMA_VERSION
+from .globals import CENSUS_INFO_ORGANISMS_NAME, CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, CXG_SCHEMA_VERSION
 
 logger = logging.getLogger(__name__)
 
@@ -35,3 +35,27 @@ def create_census_summary(
         CENSUS_SUMMARY_NAME, schema=pa.Schema.from_pandas(df, preserve_index=False), index_column_names=["soma_joinid"]
     ) as summary:
         summary.write(pa.Table.from_pandas(df, preserve_index=False))
+
+
+def create_census_info_organisms(
+    info_collection: soma.Collection, experiment_builders: Sequence[ExperimentBuilder]
+) -> None:
+    """Create and populate the organisms dataframe inside ``info_collection``.
+
+    One row is written per experiment builder, taken from its specification:
+    ``organism_ontology_term_id``, ``organism_label`` (the specification's
+    ``label``) and ``organism`` (the specification's ``name``). A
+    ``soma_joinid`` column numbering the rows from 0 is appended and used as
+    the index column of the new dataframe.
+
+    Parameters:
+    * info_collection: collection the new dataframe is added to, under the
+      name ``CENSUS_INFO_ORGANISMS_NAME``.
+    * experiment_builders: builders whose specifications describe the organisms.
+    """
+    logger.info("Create census organisms dataframe")
+
+    records = []
+    for builder in experiment_builders:
+        spec = builder.specification
+        records.append(
+            {
+                "organism_ontology_term_id": spec.organism_ontology_term_id,
+                "organism_label": spec.label,
+                "organism": spec.name,
+            }
+        )
+    organisms_df = pd.DataFrame.from_records(records)
+
+    # Rows are numbered in builder order; this column is the dataframe index.
+    organisms_df["soma_joinid"] = range(len(organisms_df))
+
+    organisms_schema = pa.Schema.from_pandas(organisms_df, preserve_index=False)
+    with info_collection.add_new_dataframe(
+        CENSUS_INFO_ORGANISMS_NAME,
+        schema=organisms_schema,
+        index_column_names=["soma_joinid"],
+    ) as organisms:
+        organisms.write(pa.Table.from_pandas(organisms_df, preserve_index=False))
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py
@@ -84,26 +84,30 @@ class ExperimentSpecification:
     specification, independent of the datasets used to build the census.
 
     Parameters:
-    * experiment "name" (eg, 'human'), must be unique in all experiments.
+    * experiment "name" (eg, 'homo_sapiens'), must be unique in all experiments.
+    * a human-readable label, e.g., "Homo sapiens"
+    * the organism's ontology term ID, e.g., "NCBITaxon:9606"
     * an AnnData filter used to cherry pick data for the experiment
     * external reference data used to build the experiment, e.g., gene length data
 
     Usage: to create, use the factory method `ExperimentSpecification.create(...)`
     """
 
     name: str
+    label: str
     anndata_cell_filter_spec: AnnDataFilterSpec
     organism_ontology_term_id: str
 
     @classmethod
     def create(
         cls,
         name: str,
+        label: str,
         anndata_cell_filter_spec: AnnDataFilterSpec,
         organism_ontology_term_id: str,
     ) -> Self:
         """Factory method. Do not instantiate the class directly."""
-        return cls(name, anndata_cell_filter_spec, organism_ontology_term_id)
+        return cls(name, label, anndata_cell_filter_spec, organism_ontology_term_id)
 
 
 class ExperimentBuilder:

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py
@@ -14,6 +14,7 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
     return [  # The soma.Experiments we want to build
         ExperimentSpecification.create(
             name="homo_sapiens",
+            label="Homo sapiens",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:9606",
                 "assay_ontology_term_ids": RNA_SEQ,
@@ -22,6 +23,7 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
         ),
         ExperimentSpecification.create(
             name="mus_musculus",
+            label="Mus musculus",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:10090",
                 "assay_ontology_term_ids": RNA_SEQ,

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -63,6 +63,9 @@
 # "census_info"/"summary_cell_counts" SOMA Dataframe
 CENSUS_SUMMARY_NAME = "summary"
 
+# "census_info"/"organisms" SOMA Dataframe
+CENSUS_INFO_ORGANISMS_NAME = "organisms"
+
 # "census_data"/{organism}/ms/"RNA" SOMA Matrix
 MEASUREMENT_RNA_NAME = "RNA"