chanzuckerberg · ivirshup · Jan 30, 2025 · Jan 29, 2025 · Jan 29, 2025 · Jan 29, 2025
diff --git a/tools/cellxgene_census_builder/manifests/all_spatial_manifest.csv b/tools/cellxgene_census_builder/manifests/all_spatial_manifest.csv
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py
@@ -26,21 +26,22 @@
 from .datasets import Dataset
 from .globals import (
     ALLOWED_SPATIAL_ASSAYS,
-    CENSUS_OBS_PLATFORM_CONFIG,
-    CENSUS_OBS_TABLE_SPEC,
+    CENSUS_OBS_CORE_FIELDS,
+    CENSUS_OBS_FIELDS_MAPPED,
+    CENSUS_OBS_STATS_FIELDS,
     CENSUS_VAR_PLATFORM_CONFIG,
     CENSUS_VAR_TABLE_SPEC,
     CENSUS_X_LAYERS,
     CENSUS_X_LAYERS_PLATFORM_CONFIG,
-    CXG_OBS_COLUMNS_READ,
     CXG_VAR_COLUMNS_READ,
     DONOR_ID_IGNORE,
     FEATURE_DATASET_PRESENCE_MATRIX_NAME,
     FULL_GENE_ASSAY,
     MEASUREMENT_RNA_NAME,
+    USE_ARROW_DICTIONARY,
     SOMA_TileDB_Context,
 )
-from .schema_util import TableSpec
+from .schema_util import FieldSpec, TableSpec
 from .stats import get_obs_stats, get_var_stats
 from .summary_cell_counts import (
     accumulate_summary_counts,
@@ -96,24 +97,91 @@ class ExperimentSpecification:
 
     name: str
     label: str
+    root_collection: str
     anndata_cell_filter_spec: AnnDataFilterSpec
     organism_ontology_term_id: str
+    obs_term_fields: list[FieldSpec]
+    obs_term_fields_read: list[FieldSpec]
 
     @classmethod
     def create(
         cls,
         name: str,
         label: str,
+        root_collection: str,
         anndata_cell_filter_spec: AnnDataFilterSpec,
         organism_ontology_term_id: str,
+        obs_term_fields: list[FieldSpec],
+        obs_term_fields_read: list[FieldSpec],
     ) -> Self:
         """Factory method. Do not instantiate the class directly."""
-        return cls(name, label, anndata_cell_filter_spec, organism_ontology_term_id)
+        return cls(
+            name,
+            label,
+            root_collection,
+            anndata_cell_filter_spec,
+            organism_ontology_term_id,
+            obs_term_fields,
+            obs_term_fields_read,
+        )
 
     def is_exclusively_spatial(self) -> bool:
         """Returns True if the experiment specification EXCLUSIVELY involves spatial assays."""
         return self.anndata_cell_filter_spec["assay_ontology_term_ids"] == ALLOWED_SPATIAL_ASSAYS
 
+    @property
+    def obs_table_spec(self) -> TableSpec:
+        return TableSpec.create(
+            CENSUS_OBS_CORE_FIELDS + self.obs_term_fields + CENSUS_OBS_FIELDS_MAPPED + CENSUS_OBS_STATS_FIELDS,
+            use_arrow_dictionary=USE_ARROW_DICTIONARY,
+        )
+
+    @property
+    def obs_platform_config(self) -> dict[str, Any]:
+        """Materialization (filter pipelines, capacity, etc) of obs/var schema in TileDB is tuned by empirical testing."""
+        # Numeric cols
+        _NumericObsAttrs = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
+        if self.is_exclusively_spatial():
+            _NumericObsAttrs += ["array_row", "array_col", "in_tissue"]
+        # Categorical/dict-like columns
+        _DictLikeObsAttrs = [
+            f.name
+            for f in self.obs_table_spec.fields
+            if isinstance(f, FieldSpec) and f.is_dictionary
+            if f.is_dictionary and f.name not in (_NumericObsAttrs + ["soma_joinid"])
+        ]
+        # Best of the rest: every remaining attribute that is neither numeric nor dict-like
+        _AllOtherObsAttrs = [
+            f.name
+            for f in self.obs_table_spec.fields
+            if f.name not in (_DictLikeObsAttrs + _NumericObsAttrs + ["soma_joinid"])
+        ]
+        # Dict filter varies depending on whether we are using dictionary types in the schema
+        _DictLikeFilter: list[Any] = (
+            [{"_type": "ZstdFilter", "level": 9}]
+            if USE_ARROW_DICTIONARY
+            else ["DictionaryFilter", {"_type": "ZstdFilter", "level": 19}]
+        )
+
+        return {
+            "tiledb": {
+                "create": {
+                    "capacity": 2**16,
+                    "dims": {"soma_joinid": {"filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}]}},
+                    "attrs": {
+                        **{
+                            k: {"filters": ["ByteShuffleFilter", {"_type": "ZstdFilter", "level": 9}]}
+                            for k in _NumericObsAttrs
+                        },
+                        **{k: {"filters": _DictLikeFilter} for k in _DictLikeObsAttrs},
+                        **{k: {"filters": [{"_type": "ZstdFilter", "level": 19}]} for k in _AllOtherObsAttrs},
+                    },
+                    "offsets_filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}],
+                    "allows_duplicates": True,
+                }
+            }
+        }
+
 
 class ExperimentBuilder:
     """Class that embodies the operators and state to build an Experiment.
@@ -169,8 +237,8 @@ def write_obs_dataframe(self) -> None:
         assert self.experiment is not None
         _assert_open_for_write(self.experiment)
 
-        obs_df = CENSUS_OBS_TABLE_SPEC.recategoricalize(self.obs_df)
-        obs_schema = CENSUS_OBS_TABLE_SPEC.to_arrow_schema(obs_df)
+        obs_df = self.specification.obs_table_spec.recategoricalize(self.obs_df)
+        obs_schema = self.specification.obs_table_spec.to_arrow_schema(obs_df)
 
         if obs_df is None or obs_df.empty:
             domain = None
@@ -182,7 +250,7 @@ def write_obs_dataframe(self) -> None:
             "obs",
             schema=obs_schema,
             index_column_names=["soma_joinid"],
-            platform_config=CENSUS_OBS_PLATFORM_CONFIG,
+            platform_config=self.specification.obs_platform_config,
             domain=domain,
         )
 
@@ -301,7 +369,7 @@ def get_obs_and_var(
             dataset,
             base_path=base_path,
             filter_spec=spec.anndata_cell_filter_spec,
-            obs_column_names=CXG_OBS_COLUMNS_READ,
+            obs_column_names=tuple(field.name for field in spec.obs_term_fields_read),
             var_column_names=CXG_VAR_COLUMNS_READ,
         ) as adata:
             logger.debug(f"{dataset.dataset_id}/{spec.name} - found {adata.n_obs} cells")
@@ -311,7 +379,8 @@ def get_obs_and_var(
                 logger.debug(f"{spec.name} - H5AD has no data after filtering, skipping {dataset.dataset_id}")
                 return pd.DataFrame(), pd.DataFrame()
 
-            obs_df = adata.obs.copy()
+            # Convert to pandas nullable dtypes to account for datasets that don't provide all columns
+            obs_df = adata.obs.copy().convert_dtypes()
             obs_df["dataset_id"] = dataset.dataset_id
 
             var_df = (
@@ -387,7 +456,7 @@ def per_dataset_summary_counts(eb: ExperimentBuilder, obs: pd.DataFrame) -> None
 
         # add columns to be completed later, e.g., summary stats such as mean of X
         add_placeholder_columns(
-            obs, CENSUS_OBS_TABLE_SPEC, default={np.int64: np.iinfo(np.int64).min, np.float64: np.nan}
+            obs, eb.specification.obs_table_spec, default={np.int64: np.iinfo(np.int64).min, np.float64: np.nan}
         )
         add_placeholder_columns(var, CENSUS_VAR_TABLE_SPEC, default={np.int64: 0})
 

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py
@@ -1,7 +1,13 @@
 import functools
 
 from .experiment_builder import ExperimentBuilder, ExperimentSpecification
-from .globals import ALLOWED_SPATIAL_ASSAYS, RNA_SEQ
+from .globals import (
+    ALLOWED_SPATIAL_ASSAYS,
+    CXG_OBS_FIELDS_READ,
+    CXG_OBS_TERM_FIELDS,
+    CXG_OBS_TERM_FIELDS_SPATIAL,
+    RNA_SEQ,
+)
 
 
 @functools.cache
@@ -15,39 +21,51 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
         ExperimentSpecification.create(
             name="homo_sapiens",
             label="Homo sapiens",
+            root_collection="census_data",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:9606",
                 "assay_ontology_term_ids": RNA_SEQ,
             },
             organism_ontology_term_id="NCBITaxon:9606",
+            obs_term_fields=CXG_OBS_TERM_FIELDS,
+            obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ,
         ),
         ExperimentSpecification.create(
             name="mus_musculus",
             label="Mus musculus",
+            root_collection="census_data",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:10090",
                 "assay_ontology_term_ids": RNA_SEQ,
             },
             organism_ontology_term_id="NCBITaxon:10090",
+            obs_term_fields=CXG_OBS_TERM_FIELDS,
+            obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ,
         ),
         # Experiments for spatial assays
         ExperimentSpecification.create(
             name="homo_sapiens",
             label="Homo sapiens",
+            root_collection="census_spatial_sequencing",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:9606",
                 "assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
             },
             organism_ontology_term_id="NCBITaxon:9606",
+            obs_term_fields=CXG_OBS_TERM_FIELDS + CXG_OBS_TERM_FIELDS_SPATIAL,
+            obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ + CXG_OBS_TERM_FIELDS_SPATIAL,
         ),
         ExperimentSpecification.create(
             name="mus_musculus",
             label="Mus musculus",
+            root_collection="census_spatial_sequencing",
             anndata_cell_filter_spec={
                 "organism_ontology_term_id": "NCBITaxon:10090",
                 "assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
             },
             organism_ontology_term_id="NCBITaxon:10090",
+            obs_term_fields=CXG_OBS_TERM_FIELDS + CXG_OBS_TERM_FIELDS_SPATIAL,
+            obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ + CXG_OBS_TERM_FIELDS_SPATIAL,
         ),
     ]
 

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -1,5 +1,4 @@
 import functools
-from typing import Any
 
 import pyarrow as pa
 import tiledbsoma as soma
@@ -82,36 +81,7 @@
 # than `string` or `binary`. There is no at-rest difference (TileDB-SOMA encodes both as large),
 # but the in-memory Arrow Array indices for string/binary can overflow as cell counts increase.
 #
-CXG_OBS_TERM_COLUMNS = [  # Columns pulled from the CXG H5AD without modification.
-    "assay",
-    "assay_ontology_term_id",
-    "cell_type",
-    "cell_type_ontology_term_id",
-    "development_stage",
-    "development_stage_ontology_term_id",
-    "disease",
-    "disease_ontology_term_id",
-    "donor_id",
-    "is_primary_data",
-    "observation_joinid",
-    "self_reported_ethnicity",
-    "self_reported_ethnicity_ontology_term_id",
-    "sex",
-    "sex_ontology_term_id",
-    "suspension_type",
-    "tissue",
-    "tissue_ontology_term_id",
-    "tissue_type",
-]
-CXG_OBS_COLUMNS_READ: tuple[str, ...] = (  # Columns READ from the CXG H5AD - see open_anndata()
-    *CXG_OBS_TERM_COLUMNS,
-    "organism",
-    "organism_ontology_term_id",
-)
-CENSUS_OBS_STATS_COLUMNS = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
-CENSUS_OBS_FIELDS: list[FieldSpec | tuple[str, pa.DataType]] = [
-    ("soma_joinid", pa.int64()),
-    FieldSpec(name="dataset_id", type=pa.large_string(), is_dictionary=True),
+CXG_OBS_TERM_FIELDS = [  # Columns pulled from the CXG H5AD without modification.
     FieldSpec(name="assay", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="assay_ontology_term_id", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="cell_type", type=pa.large_string(), is_dictionary=True),
@@ -121,8 +91,8 @@
     FieldSpec(name="disease", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="disease_ontology_term_id", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="donor_id", type=pa.large_string(), is_dictionary=True),
-    ("is_primary_data", pa.bool_()),
-    ("observation_joinid", pa.large_string()),
+    FieldSpec(name="is_primary_data", type=pa.bool_(), is_dictionary=False),
+    FieldSpec(name="observation_joinid", type=pa.large_string(), is_dictionary=False),
     FieldSpec(name="self_reported_ethnicity", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="self_reported_ethnicity_ontology_term_id", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="sex", type=pa.large_string(), is_dictionary=True),
@@ -131,57 +101,34 @@
     FieldSpec(name="tissue", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="tissue_ontology_term_id", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="tissue_type", type=pa.large_string(), is_dictionary=True),
+]
+CXG_OBS_TERM_FIELDS_SPATIAL = [  # Spatial-specific columns
+    FieldSpec(name="in_tissue", type=pa.int64(), is_dictionary=False),
+    FieldSpec(name="array_row", type=pa.int64(), is_dictionary=False),
+    FieldSpec(name="array_col", type=pa.int64(), is_dictionary=False),
+]
+CENSUS_OBS_FIELDS_MAPPED = [  # Annotation columns created by census builder
     FieldSpec(name="tissue_general", type=pa.large_string(), is_dictionary=True),
     FieldSpec(name="tissue_general_ontology_term_id", type=pa.large_string(), is_dictionary=True),
-    ("raw_sum", pa.float64()),
-    ("nnz", pa.int64()),
-    ("raw_mean_nnz", pa.float64()),
-    ("raw_variance_nnz", pa.float64()),
-    ("n_measured_vars", pa.int64()),
 ]
-CENSUS_OBS_TABLE_SPEC = TableSpec.create(CENSUS_OBS_FIELDS, use_arrow_dictionary=USE_ARROW_DICTIONARY)
-
-"""
-Materialization (filter pipelines, capacity, etc) of obs/var schema in TileDB is tuned by empirical testing.
-"""
-# Numeric columns
-_NumericObsAttrs = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
-# Categorical/dict-like columns
-_DictLikeObsAttrs = [
-    f.name
-    for f in CENSUS_OBS_FIELDS
-    if isinstance(f, FieldSpec) and f.is_dictionary
-    if f.is_dictionary and f.name not in (_NumericObsAttrs + ["soma_joinid"])
+CENSUS_OBS_STATS_FIELDS = [  # Columns for stats calculated during build
+    FieldSpec(name="raw_sum", type=pa.float64(), is_dictionary=False),
+    FieldSpec(name="nnz", type=pa.int64(), is_dictionary=False),
+    FieldSpec(name="raw_mean_nnz", type=pa.float64(), is_dictionary=False),
+    FieldSpec(name="raw_variance_nnz", type=pa.float64(), is_dictionary=False),
+    FieldSpec(name="n_measured_vars", type=pa.int64(), is_dictionary=False),
 ]
-# Best of the rest
-_AllOtherObsAttrs = [
-    f.name
-    for f in CENSUS_OBS_TABLE_SPEC.fields
-    if f.name not in (_DictLikeObsAttrs + _NumericObsAttrs + ["soma_joinid"])
+CENSUS_OBS_CORE_FIELDS = [
+    FieldSpec(name="soma_joinid", type=pa.int64(), is_dictionary=False),
+    FieldSpec(name="dataset_id", type=pa.large_string(), is_dictionary=True),
 ]
-# Dict filter varies depending on whether we are using dictionary types in the schema
-_DictLikeFilter: list[Any] = (
-    [{"_type": "ZstdFilter", "level": 9}]
-    if USE_ARROW_DICTIONARY
-    else ["DictionaryFilter", {"_type": "ZstdFilter", "level": 19}]
-)
-CENSUS_OBS_PLATFORM_CONFIG = {
-    "tiledb": {
-        "create": {
-            "capacity": 2**16,
-            "dims": {"soma_joinid": {"filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}]}},
-            "attrs": {
-                **{
-                    k: {"filters": ["ByteShuffleFilter", {"_type": "ZstdFilter", "level": 9}]} for k in _NumericObsAttrs
-                },
-                **{k: {"filters": _DictLikeFilter} for k in _DictLikeObsAttrs},
-                **{k: {"filters": [{"_type": "ZstdFilter", "level": 19}]} for k in _AllOtherObsAttrs},
-            },
-            "offsets_filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}],
-            "allows_duplicates": True,
-        }
-    }
-}
+# These are not actually written, so pyarrow type is not important
+CXG_OBS_FIELDS_READ = [  # Columns READ from the CXG H5AD - see open_anndata()
+    FieldSpec(name="organism", type=pa.large_string(), is_dictionary=True),
+    FieldSpec(name="organism_ontology_term_id", type=pa.large_string(), is_dictionary=True),
+]
+# TODO: remove and use FIELDS variable
+CENSUS_OBS_STATS_COLUMNS = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
 
 CXG_VAR_COLUMNS_READ: tuple[str, ...] = (
     "_index",