Skip to content

Commit

Permalink
refactor ExperimentBuilder and validator for improved validator isola…
Browse files Browse the repository at this point in the history
…tion (#237)

* add attrs to dependencies

* fix mis-use of AnnData

* refactor experiment builder for better validator isolation

* dead code removal
  • Loading branch information
Bruce Martin authored Mar 3, 2023
1 parent 7851047 commit 57972f3
Show file tree
Hide file tree
Showing 7 changed files with 212 additions and 188 deletions.
27 changes: 14 additions & 13 deletions tools/cell_census_builder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,12 @@
from .census_summary import create_census_summary
from .consolidate import consolidate
from .datasets import Dataset, assign_dataset_soma_joinids, create_dataset_manifest
from .experiment_builder import ExperimentBuilder, populate_X_layers, reopen_experiment_builders
from .experiment_builder import (
ExperimentBuilder,
ExperimentSpecification,
populate_X_layers,
reopen_experiment_builders,
)
from .globals import (
CENSUS_DATA_NAME,
CENSUS_INFO_NAME,
Expand All @@ -30,7 +35,7 @@
from .validate import validate


def make_experiment_builders() -> List[ExperimentBuilder]:
def make_experiment_specs() -> List[ExperimentSpecification]:
"""
Define all soma.Experiments to build in the census.
Expand All @@ -46,21 +51,19 @@ def make_experiment_builders() -> List[ExperimentBuilder]:
GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz",
GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz",
]
experiment_builders = [ # The soma.Experiments we want to build
ExperimentBuilder(
return [ # The soma.Experiments we want to build
ExperimentSpecification.create(
name="homo_sapiens",
anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ),
gene_feature_length_uris=GENE_LENGTH_URIS,
),
ExperimentBuilder(
ExperimentSpecification.create(
name="mus_musculus",
anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ),
gene_feature_length_uris=GENE_LENGTH_URIS,
),
]

return experiment_builders


def main() -> int:
parser = create_args_parser()
Expand All @@ -73,15 +76,16 @@ def main() -> int:
soma_path = uricat(args.uri, args.build_tag, "soma")
assets_path = uricat(args.uri, args.build_tag, "h5ads")

# create the experiment builders
experiment_builders = make_experiment_builders()
# create the experiment specifications and builders
experiment_specifications = make_experiment_specs()
experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications]

cc = 0
if args.subcommand == "build":
cc = build(args, soma_path, assets_path, experiment_builders)

if cc == 0 and (args.subcommand == "validate" or args.validate):
validate(args, soma_path, assets_path, experiment_builders)
validate(args, soma_path, assets_path, experiment_specifications)

return cc

Expand Down Expand Up @@ -145,9 +149,6 @@ def build(
# Step 5- write out dataset manifest and summary information
build_step5_populate_summary_info(root_collection, experiment_builders, filtered_datasets, args.build_tag)

for eb in experiment_builders:
eb.build_completed = True

# consolidate TileDB data
if args.consolidate:
consolidate(args, root_collection.uri)
Expand Down
2 changes: 2 additions & 0 deletions tools/cell_census_builder/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dat
logging.info("Creating dataset_manifest")
manifest_df = Dataset.to_dataframe(datasets)
manifest_df = manifest_df[CENSUS_DATASETS_COLUMNS + ["soma_joinid"]]
if len(manifest_df) == 0:
return

# write to a SOMA dataframe
with info_collection.add_new_dataframe(
Expand Down
80 changes: 55 additions & 25 deletions tools/cell_census_builder/experiment_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Dict, Generator, List, Optional, Sequence, Tuple, TypedDict, Union, overload

import anndata
import attrs
import numpy as np
import numpy.typing as npt
import pandas as pd
Expand All @@ -15,6 +16,7 @@
import tiledbsoma as soma
from scipy import sparse
from somacore.options import OpenMode
from typing_extensions import Self

from .anndata import AnnDataFilterSpec, make_anndata_cell_filter, open_anndata
from .datasets import Dataset
Expand Down Expand Up @@ -65,25 +67,58 @@ def _assert_open_for_write(obj: somacore.SOMAObject) -> None:
assert not obj.closed


class ExperimentBuilder:
@attrs.define(frozen=True)
class ExperimentSpecification:
"""
Class to help build a parameterized SOMA experiment, where key parameters are:
Declarative "specification" of a SOMA experiment. This is a read-only
specification, independent of the datasets used to build the census.
Parameters:
* experiment "name" (eg, 'human'), must be unique in all experiments.
* an AnnData filter used to cherry pick data for the experiment
* methods to progressively build the experiment
* external reference data used to build the experiment, e.g., gene length data
The creation and driving of these objects is done by the main loop.
Usage: to create, use the factory method `ExperimentSpecification.create(...)`
"""

name: str
anndata_cell_filter_spec: AnnDataFilterSpec
gene_feature_length_uris: List[str]
gene_feature_length: pd.DataFrame

def __init__(self, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_feature_length_uris: List[str]):
self.name = name
self.anndata_cell_filter_spec = anndata_cell_filter_spec
self.gene_feature_length_uris = gene_feature_length_uris
@classmethod
def create(
cls, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_feature_length_uris: List[str]
) -> Self:
"""Factory method. Do not instantiate the class directly."""
gene_feature_length = cls._load_gene_feature_length(gene_feature_length_uris)
logging.info(f"Loaded gene lengths external reference for {name}, {len(gene_feature_length)} genes.")
return cls(name, anndata_cell_filter_spec, gene_feature_length_uris, gene_feature_length)

@classmethod
def _load_gene_feature_length(cls, gene_feature_length_uris: Sequence[str]) -> pd.DataFrame:
"""
Private. Load any external assets required to create the experiment.
"""
return pd.concat(
pd.read_csv(
io.BytesIO(cat_file(uri)),
names=["feature_id", "feature_name", "gene_version", "feature_length"],
)
.set_index("feature_id")
.drop(columns=["feature_name", "gene_version"])
for uri in gene_feature_length_uris
)


class ExperimentBuilder:
"""
Class that embodies the operators and state to build an Experiment.
The creation and driving of these objects is done by the main loop.
"""

def __init__(self, specification: ExperimentSpecification):
self.specification = specification

# accumulated state
self.n_obs: int = 0
Expand All @@ -98,24 +133,18 @@ def __init__(self, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_
self.experiment_uri: Optional[str] = None # initialized in create()
self.global_var_joinids: Optional[pd.DataFrame] = None
self.presence: Dict[int, Tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]] = {}
self.build_completed: bool = False

self.load_assets()
@property
def name(self) -> str:
return self.specification.name

def load_assets(self) -> None:
"""
Load any external assets required to create the experiment.
"""
self.gene_feature_length = pd.concat(
pd.read_csv(
io.BytesIO(cat_file(uri)),
names=["feature_id", "feature_name", "gene_version", "feature_length"],
)
.set_index("feature_id")
.drop(columns=["feature_name", "gene_version"])
for uri in self.gene_feature_length_uris
)
logging.info(f"Loaded gene lengths external reference for {self.name}, {len(self.gene_feature_length)} genes.")
@property
def anndata_cell_filter_spec(self) -> AnnDataFilterSpec:
return self.specification.anndata_cell_filter_spec

@property
def gene_feature_length(self) -> pd.DataFrame:
return self.specification.gene_feature_length

def create(self, census_data: soma.Collection) -> None:
"""Create experiment within the specified Collection with a single Measurement."""
Expand Down Expand Up @@ -260,7 +289,8 @@ def populate_presence_matrix(self, datasets: List[Dataset]) -> None:
"""
_assert_open_for_write(self.experiment)

# SOMA does not currently support empty arrays, so special case this corner-case.
# SOMA does not currently arrays with a zero length domain, so special case this corner-case
# where no data has been read for this experiment.
if len(self.presence) > 0:
# sanity check
assert len(self.presence) == self.n_datasets
Expand Down
5 changes: 3 additions & 2 deletions tools/cell_census_builder/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class Organism:


def get_h5ad(organism: Organism) -> anndata.AnnData:
X = np.random.randint(5, size=(4, 4))
X = np.random.randint(5, size=(4, 4)).astype(np.float32)
# The builder only supports sparse matrices
X = sparse.csr_matrix(X)

Expand All @@ -54,7 +54,8 @@ def get_h5ad(organism: Organism) -> anndata.AnnData:
"sex": "test",
"tissue": "test",
"organism": "test",
}
},
index=["1", "2", "3", "4"],
)
obs = obs_dataframe

Expand Down
9 changes: 6 additions & 3 deletions tools/cell_census_builder/tests/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
import tiledb
import tiledbsoma as soma

from tools.cell_census_builder.__main__ import build, make_experiment_builders
from tools.cell_census_builder.__main__ import build, make_experiment_specs
from tools.cell_census_builder.datasets import Dataset
from tools.cell_census_builder.experiment_builder import ExperimentBuilder
from tools.cell_census_builder.globals import (
CENSUS_DATA_NAME,
CENSUS_INFO_NAME,
Expand All @@ -32,7 +33,9 @@ def test_base_builder_creation(
"tools.cell_census_builder.validate.validate_consolidation", return_value=True
):
# Patching consolidate_tiledb_object, becuase is uses to much memory to run in github actions.
experiment_builders = make_experiment_builders()
experiment_specifications = make_experiment_specs()
experiment_builders = [ExperimentBuilder(spec) for spec in experiment_specifications]

from types import SimpleNamespace

args = SimpleNamespace(multi_process=False, consolidate=True, build_tag="test_tag", max_workers=1, verbose=True)
Expand All @@ -42,7 +45,7 @@ def test_base_builder_creation(
assert return_value == 0

# validate the cell_census
return_value = validate(args, soma_path, assets_path, experiment_builders) # type: ignore[arg-type]
return_value = validate(args, soma_path, assets_path, experiment_specifications) # type: ignore[arg-type]
assert return_value is True

# Query the census and do assertions
Expand Down
Loading

0 comments on commit 57972f3

Please sign in to comment.