refactor ExperimentBuilder and validator for improved validator isola…

…tion (#237) * add attrs to dependencies * fix mis-use of AnnData * refactor experiment builder for better validator isolation * dead code removal
chanzuckerberg · Mar 3, 2023 · 57972f3 · 57972f3
1 parent 7851047
commit 57972f3
Show file tree

Hide file tree

Showing 7 changed files with 212 additions and 188 deletions.
diff --git a/tools/cell_census_builder/__main__.py b/tools/cell_census_builder/__main__.py
@@ -13,7 +13,12 @@
 from .census_summary import create_census_summary
 from .consolidate import consolidate
 from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest
-from .experiment_builder import ExperimentBuilder, populate_X_layers, reopen_experiment_builders
+from .experiment_builder import (
+    ExperimentBuilder,
+    ExperimentSpecification,
+    populate_X_layers,
+    reopen_experiment_builders,
+)
 from .globals import (
     CENSUS_DATA_NAME,
     CENSUS_INFO_NAME,
@@ -30,7 +35,7 @@
 from .validate import validate
 
 
-def make_experiment_builders() -> List[ExperimentBuilder]:
+def make_experiment_specs() -> List[ExperimentSpecification]:
     """
     Define all soma.Experiments to build in the census.
 
@@ -46,21 +51,19 @@ def make_experiment_builders() -> List[ExperimentBuilder]:
         GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz",
         GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz",
     ]
-    experiment_builders = [  # The soma.Experiments we want to build
-        ExperimentBuilder(
+    return [  # The soma.Experiments we want to build
+        ExperimentSpecification.create(
             name="homo_sapiens",
             anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ),
             gene_feature_length_uris=GENE_LENGTH_URIS,
         ),
-        ExperimentBuilder(
+        ExperimentSpecification.create(
             name="mus_musculus",
             anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ),
             gene_feature_length_uris=GENE_LENGTH_URIS,
         ),
     ]
 
-    return experiment_builders
-
 
 def main() -> int:
     parser = create_args_parser()
@@ -73,15 +76,16 @@ def main() -> int:
     soma_path = uricat(args.uri, args.build_tag, "soma")
     assets_path = uricat(args.uri, args.build_tag, "h5ads")
 
-    # create the experiment builders
-    experiment_builders = make_experiment_builders()
+    # create the experiment specifications and builders
+    experiment_specifications = make_experiment_specs()
+    experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications]
 
     cc = 0
     if args.subcommand == "build":
         cc = build(args, soma_path, assets_path, experiment_builders)
 
     if cc == 0 and (args.subcommand == "validate" or args.validate):
-        validate(args, soma_path, assets_path, experiment_builders)
+        validate(args, soma_path, assets_path, experiment_specifications)
 
     return cc
 
@@ -145,9 +149,6 @@ def build(
     # Step 5- write out dataset manifest and summary information
     build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag)
 
-    for eb in experiment_builders:
-        eb.build_completed = True
-
     # consolidate TileDB data
     if args.consolidate:
         consolidate(args, root_collection.uri)

diff --git a/tools/cell_census_builder/datasets.py b/tools/cell_census_builder/datasets.py
@@ -69,6 +69,8 @@ def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dat
     logging.info("Creating dataset_manifest")
     manifest_df = Dataset.to_dataframe(datasets)
     manifest_df = manifest_df[CENSUS_DATASETS_COLUMNS + ["soma_joinid"]]
+    if len(manifest_df) == 0:
+        return
 
     # write to a SOMA dataframe
     with info_collection.add_new_dataframe(

diff --git a/tools/cell_census_builder/experiment_builder.py b/tools/cell_census_builder/experiment_builder.py
@@ -7,6 +7,7 @@
 from typing import Dict, Generator, List, Optional, Sequence, Tuple, TypedDict, Union, overload
 
 import anndata
+import attrs
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
@@ -15,6 +16,7 @@
 import tiledbsoma as soma
 from scipy import sparse
 from somacore.options import OpenMode
+from typing_extensions import Self
 
 from .anndata import AnnDataFilterSpec, make_anndata_cell_filter, open_anndata
 from .datasets import Dataset
@@ -65,25 +67,58 @@ def _assert_open_for_write(obj: somacore.SOMAObject) -> None:
     assert not obj.closed
 
 
-class ExperimentBuilder:
+@attrs.define(frozen=True)
+class ExperimentSpecification:
     """
-    Class to help build a parameterized SOMA experiment, where key parameters are:
+    Declarative "specification" of a SOMA experiment. This is a read-only
+    specification, independent of the datasets used to build the census.
+
+    Parameters:
     * experiment "name" (eg, 'human'), must be unique in all experiments.
     * an AnnData filter used to cherry pick data for the experiment
-    * methods to progressively build the experiment
+    * external reference data used to build the experiment, e.g., gene length data
 
-    The creation and driving of these objects is done by the main loop.
+    Usage: to create, use the factory method `ExperimentSpecification.create(...)`
     """
 
     name: str
     anndata_cell_filter_spec: AnnDataFilterSpec
     gene_feature_length_uris: List[str]
     gene_feature_length: pd.DataFrame
 
-    def __init__(self, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_feature_length_uris: List[str]):
-        self.name = name
-        self.anndata_cell_filter_spec = anndata_cell_filter_spec
-        self.gene_feature_length_uris = gene_feature_length_uris
+    @classmethod
+    def create(
+        cls, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_feature_length_uris: List[str]
+    ) -> Self:
+        """Factory method. Do not instantiate the class directly."""
+        gene_feature_length = cls._load_gene_feature_length(gene_feature_length_uris)
+        logging.info(f"Loaded gene lengths external reference for {name}, {len(gene_feature_length)} genes.")
+        return cls(name, anndata_cell_filter_spec, gene_feature_length_uris, gene_feature_length)
+
+    @classmethod
+    def _load_gene_feature_length(cls, gene_feature_length_uris: Sequence[str]) -> pd.DataFrame:
+        """
+        Private. Load any external assets required to create the experiment.
+        """
+        return pd.concat(
+            pd.read_csv(
+                io.BytesIO(cat_file(uri)),
+                names=["feature_id", "feature_name", "gene_version", "feature_length"],
+            )
+            .set_index("feature_id")
+            .drop(columns=["feature_name", "gene_version"])
+            for uri in gene_feature_length_uris
+        )
+
+
+class ExperimentBuilder:
+    """
+    Class that embodies the operators and state to build an Experiment.
+    The creation and driving of these objects is done by the main loop.
+    """
+
+    def __init__(self, specification: ExperimentSpecification):
+        self.specification = specification
 
         # accumulated state
         self.n_obs: int = 0
@@ -98,24 +133,18 @@ def __init__(self, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_
         self.experiment_uri: Optional[str] = None  # initialized in create()
         self.global_var_joinids: Optional[pd.DataFrame] = None
         self.presence: Dict[int, Tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]] = {}
-        self.build_completed: bool = False
 
-        self.load_assets()
+    @property
+    def name(self) -> str:
+        return self.specification.name
 
-    def load_assets(self) -> None:
-        """
-        Load any external assets required to create the experiment.
-        """
-        self.gene_feature_length = pd.concat(
-            pd.read_csv(
-                io.BytesIO(cat_file(uri)),
-                names=["feature_id", "feature_name", "gene_version", "feature_length"],
-            )
-            .set_index("feature_id")
-            .drop(columns=["feature_name", "gene_version"])
-            for uri in self.gene_feature_length_uris
-        )
-        logging.info(f"Loaded gene lengths external reference for {self.name}, {len(self.gene_feature_length)} genes.")
+    @property
+    def anndata_cell_filter_spec(self) -> AnnDataFilterSpec:
+        return self.specification.anndata_cell_filter_spec
+
+    @property
+    def gene_feature_length(self) -> pd.DataFrame:
+        return self.specification.gene_feature_length
 
     def create(self, census_data: soma.Collection) -> None:
         """Create experiment within the specified Collection with a single Measurement."""
@@ -260,7 +289,8 @@ def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
         """
         _assert_open_for_write(self.experiment)
 
-        # SOMA does not currently support empty arrays, so special case this corner-case.
+        # SOMA does not currently arrays with a zero length domain, so special case this corner-case
+        # where no data has been read for this experiment.
         if len(self.presence) > 0:
             # sanity check
             assert len(self.presence) == self.n_datasets

diff --git a/tools/cell_census_builder/tests/conftest.py b/tools/cell_census_builder/tests/conftest.py
@@ -27,7 +27,7 @@ class Organism:
 
 
 def get_h5ad(organism: Organism) -> anndata.AnnData:
-    X = np.random.randint(5, size=(4, 4))
+    X = np.random.randint(5, size=(4, 4)).astype(np.float32)
     # The builder only supports sparse matrices
     X = sparse.csr_matrix(X)
 
@@ -54,7 +54,8 @@ def get_h5ad(organism: Organism) -> anndata.AnnData:
             "sex": "test",
             "tissue": "test",
             "organism": "test",
-        }
+        },
+        index=["1", "2", "3", "4"],
     )
     obs = obs_dataframe
 

diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py
@@ -9,8 +9,9 @@
 import tiledb
 import tiledbsoma as soma
 
-from tools.cell_census_builder.__main__ import build, make_experiment_builders
+from tools.cell_census_builder.__main__ import build, make_experiment_specs
 from tools.cell_census_builder.datasets import Dataset
+from tools.cell_census_builder.experiment_builder import ExperimentBuilder
 from tools.cell_census_builder.globals import (
     CENSUS_DATA_NAME,
     CENSUS_INFO_NAME,
@@ -32,7 +33,9 @@ def test_base_builder_creation(
         "tools.cell_census_builder.validate.validate_consolidation", return_value=True
     ):
         # Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions.
-        experiment_builders = make_experiment_builders()
+        experiment_specifications = make_experiment_specs()
+        experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications]
+
         from types import SimpleNamespace
 
         args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", max_workers=1, verbose=True)
@@ -42,7 +45,7 @@ def test_base_builder_creation(
         assert return_value == 0
 
         # validate the cell_census
-        return_value = validate(args, soma_path, assets_path, experiment_builders)  # type: ignore[arg-type]
+        return_value = validate(args, soma_path, assets_path, experiment_specifications)  # type: ignore[arg-type]
         assert return_value is True
 
         # Query the census and do assertions