Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ivirshup/census builder spatial visium cols #1344

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
521 changes: 521 additions & 0 deletions tools/cellxgene_census_builder/manifests/all_spatial_manifest.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,22 @@
from .datasets import Dataset
from .globals import (
ALLOWED_SPATIAL_ASSAYS,
CENSUS_OBS_PLATFORM_CONFIG,
CENSUS_OBS_TABLE_SPEC,
CENSUS_OBS_CORE_FIELDS,
CENSUS_OBS_FIELDS_MAPPED,
CENSUS_OBS_STATS_FIELDS,
CENSUS_VAR_PLATFORM_CONFIG,
CENSUS_VAR_TABLE_SPEC,
CENSUS_X_LAYERS,
CENSUS_X_LAYERS_PLATFORM_CONFIG,
CXG_OBS_COLUMNS_READ,
CXG_VAR_COLUMNS_READ,
DONOR_ID_IGNORE,
FEATURE_DATASET_PRESENCE_MATRIX_NAME,
FULL_GENE_ASSAY,
MEASUREMENT_RNA_NAME,
USE_ARROW_DICTIONARY,
SOMA_TileDB_Context,
)
from .schema_util import TableSpec
from .schema_util import FieldSpec, TableSpec
from .stats import get_obs_stats, get_var_stats
from .summary_cell_counts import (
accumulate_summary_counts,
Expand Down Expand Up @@ -96,24 +97,91 @@ class ExperimentSpecification:

name: str
label: str
root_collection: str
anndata_cell_filter_spec: AnnDataFilterSpec
organism_ontology_term_id: str
obs_term_fields: list[FieldSpec]
obs_term_fields_read: list[FieldSpec]

@classmethod
def create(
    cls,
    name: str,
    label: str,
    root_collection: str,
    anndata_cell_filter_spec: AnnDataFilterSpec,
    organism_ontology_term_id: str,
    obs_term_fields: list[FieldSpec],
    obs_term_fields_read: list[FieldSpec],
) -> Self:
    """Factory method. Do not instantiate the class directly.

    Every argument is forwarded positionally to the class constructor, in
    the same order as the fields are declared on the class.
    """
    # A single constructor call carrying all seven fields; a shorter call
    # that omits root_collection / obs_term_fields* would not match the
    # declared fields.
    return cls(
        name,
        label,
        root_collection,
        anndata_cell_filter_spec,
        organism_ontology_term_id,
        obs_term_fields,
        obs_term_fields_read,
    )

def is_exclusively_spatial(self) -> bool:
    """Tell whether this specification selects ONLY spatial assays.

    True when the ``assay_ontology_term_ids`` entry of the cell filter spec
    compares equal to ``ALLOWED_SPATIAL_ASSAYS``.
    """
    selected_assays = self.anndata_cell_filter_spec["assay_ontology_term_ids"]
    return selected_assays == ALLOWED_SPATIAL_ASSAYS

@property
def obs_table_spec(self) -> TableSpec:
    """Build the obs ``TableSpec`` for this experiment.

    Columns are laid out as: core fields, this specification's obs term
    fields, builder-mapped fields, then the stats fields.
    """
    obs_fields = (
        CENSUS_OBS_CORE_FIELDS
        + self.obs_term_fields
        + CENSUS_OBS_FIELDS_MAPPED
        + CENSUS_OBS_STATS_FIELDS
    )
    return TableSpec.create(obs_fields, use_arrow_dictionary=USE_ARROW_DICTIONARY)

@property
def obs_platform_config(self) -> dict[str, Any]:
    """Return the TileDB ``platform_config`` used when creating the obs dataframe.

    Materialization (filter pipelines, capacity, etc) of obs/var schema in TileDB is tuned by empirical testing.

    Each obs attribute is assigned one of three filter pipelines:
    numeric columns, dictionary-like columns, and everything else.
    ``soma_joinid`` is configured as a dimension, not as an attribute.
    """
    # The property builds a fresh TableSpec on every access, so fetch it once.
    table_spec = self.obs_table_spec

    # Numeric columns
    numeric_attrs = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
    if self.is_exclusively_spatial():
        numeric_attrs += ["array_row", "array_col", "in_tissue"]

    # Categorical/dict-like columns
    not_dict_like = {*numeric_attrs, "soma_joinid"}
    dict_like_attrs = [
        f.name
        for f in table_spec.fields
        if isinstance(f, FieldSpec) and f.is_dictionary and f.name not in not_dict_like
    ]

    # Best of the rest: whatever was not claimed by the two groups above
    already_assigned = not_dict_like.union(dict_like_attrs)
    other_attrs = [f.name for f in table_spec.fields if f.name not in already_assigned]

    # Dict filter varies depending on whether we are using dictionary types in the schema
    dict_like_filter: list[Any] = (
        [{"_type": "ZstdFilter", "level": 9}]
        if USE_ARROW_DICTIONARY
        else ["DictionaryFilter", {"_type": "ZstdFilter", "level": 19}]
    )

    return {
        "tiledb": {
            "create": {
                "capacity": 2**16,
                "dims": {"soma_joinid": {"filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}]}},
                "attrs": {
                    **{
                        k: {"filters": ["ByteShuffleFilter", {"_type": "ZstdFilter", "level": 9}]}
                        for k in numeric_attrs
                    },
                    **{k: {"filters": dict_like_filter} for k in dict_like_attrs},
                    **{k: {"filters": [{"_type": "ZstdFilter", "level": 19}]} for k in other_attrs},
                },
                "offsets_filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}],
                "allows_duplicates": True,
            }
        }
    }


class ExperimentBuilder:
"""Class that embodies the operators and state to build an Experiment.
Expand Down Expand Up @@ -169,8 +237,8 @@ def write_obs_dataframe(self) -> None:
assert self.experiment is not None
_assert_open_for_write(self.experiment)

obs_df = CENSUS_OBS_TABLE_SPEC.recategoricalize(self.obs_df)
obs_schema = CENSUS_OBS_TABLE_SPEC.to_arrow_schema(obs_df)
obs_df = self.specification.obs_table_spec.recategoricalize(self.obs_df)
obs_schema = self.specification.obs_table_spec.to_arrow_schema(obs_df)

if obs_df is None or obs_df.empty:
domain = None
Expand All @@ -182,7 +250,7 @@ def write_obs_dataframe(self) -> None:
"obs",
schema=obs_schema,
index_column_names=["soma_joinid"],
platform_config=CENSUS_OBS_PLATFORM_CONFIG,
platform_config=self.specification.obs_platform_config,
domain=domain,
)

Expand Down Expand Up @@ -301,7 +369,7 @@ def get_obs_and_var(
dataset,
base_path=base_path,
filter_spec=spec.anndata_cell_filter_spec,
obs_column_names=CXG_OBS_COLUMNS_READ,
obs_column_names=tuple(field.name for field in spec.obs_term_fields_read),
var_column_names=CXG_VAR_COLUMNS_READ,
) as adata:
logger.debug(f"{dataset.dataset_id}/{spec.name} - found {adata.n_obs} cells")
Expand All @@ -311,7 +379,8 @@ def get_obs_and_var(
logger.debug(f"{spec.name} - H5AD has no data after filtering, skipping {dataset.dataset_id}")
return pd.DataFrame(), pd.DataFrame()

obs_df = adata.obs.copy()
# Convert to pandas nullable dtypes to account for datasets that don't provide all columns
obs_df = adata.obs.copy().convert_dtypes()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a comment to specify why this is necessary.

obs_df["dataset_id"] = dataset.dataset_id

var_df = (
Expand Down Expand Up @@ -387,7 +456,7 @@ def per_dataset_summary_counts(eb: ExperimentBuilder, obs: pd.DataFrame) -> None

# add columns to be completed later, e.g., summary stats such as mean of X
add_placeholder_columns(
obs, CENSUS_OBS_TABLE_SPEC, default={np.int64: np.iinfo(np.int64).min, np.float64: np.nan}
obs, eb.specification.obs_table_spec, default={np.int64: np.iinfo(np.int64).min, np.float64: np.nan}
)
add_placeholder_columns(var, CENSUS_VAR_TABLE_SPEC, default={np.int64: 0})

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import functools

from .experiment_builder import ExperimentBuilder, ExperimentSpecification
from .globals import ALLOWED_SPATIAL_ASSAYS, RNA_SEQ
from .globals import (
ALLOWED_SPATIAL_ASSAYS,
CXG_OBS_FIELDS_READ,
CXG_OBS_TERM_FIELDS,
CXG_OBS_TERM_FIELDS_SPATIAL,
RNA_SEQ,
)


@functools.cache
Expand All @@ -15,39 +21,51 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
ExperimentSpecification.create(
name="homo_sapiens",
label="Homo sapiens",
root_collection="census_data",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:9606",
"assay_ontology_term_ids": RNA_SEQ,
},
organism_ontology_term_id="NCBITaxon:9606",
obs_term_fields=CXG_OBS_TERM_FIELDS,
obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ,
),
ExperimentSpecification.create(
name="mus_musculus",
label="Mus musculus",
root_collection="census_data",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:10090",
"assay_ontology_term_ids": RNA_SEQ,
},
organism_ontology_term_id="NCBITaxon:10090",
obs_term_fields=CXG_OBS_TERM_FIELDS,
obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ,
),
# Experiments for spatial assays
ExperimentSpecification.create(
name="homo_sapiens",
label="Homo sapiens",
root_collection="census_spatial_sequencing",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:9606",
"assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
},
organism_ontology_term_id="NCBITaxon:9606",
obs_term_fields=CXG_OBS_TERM_FIELDS + CXG_OBS_TERM_FIELDS_SPATIAL,
obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ + CXG_OBS_TERM_FIELDS_SPATIAL,
),
ExperimentSpecification.create(
name="mus_musculus",
label="Mus musculus",
root_collection="census_spatial_sequencing",
anndata_cell_filter_spec={
"organism_ontology_term_id": "NCBITaxon:10090",
"assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
},
organism_ontology_term_id="NCBITaxon:10090",
obs_term_fields=CXG_OBS_TERM_FIELDS + CXG_OBS_TERM_FIELDS_SPATIAL,
obs_term_fields_read=CXG_OBS_TERM_FIELDS + CXG_OBS_FIELDS_READ + CXG_OBS_TERM_FIELDS_SPATIAL,
),
]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import functools
from typing import Any

import pyarrow as pa
import tiledbsoma as soma
Expand Down Expand Up @@ -82,36 +81,7 @@
# than `string` or `binary`. There is no at-rest difference (TileDB-SOMA encodes both as large),
# but the in-memory Arrow Array indices for string/binary can overflow as cell counts increase.
#
CXG_OBS_TERM_COLUMNS = [ # Columns pulled from the CXG H5AD without modification.
"assay",
"assay_ontology_term_id",
"cell_type",
"cell_type_ontology_term_id",
"development_stage",
"development_stage_ontology_term_id",
"disease",
"disease_ontology_term_id",
"donor_id",
"is_primary_data",
"observation_joinid",
"self_reported_ethnicity",
"self_reported_ethnicity_ontology_term_id",
"sex",
"sex_ontology_term_id",
"suspension_type",
"tissue",
"tissue_ontology_term_id",
"tissue_type",
]
CXG_OBS_COLUMNS_READ: tuple[str, ...] = ( # Columns READ from the CXG H5AD - see open_anndata()
*CXG_OBS_TERM_COLUMNS,
"organism",
"organism_ontology_term_id",
)
CENSUS_OBS_STATS_COLUMNS = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
CENSUS_OBS_FIELDS: list[FieldSpec | tuple[str, pa.DataType]] = [
("soma_joinid", pa.int64()),
FieldSpec(name="dataset_id", type=pa.large_string(), is_dictionary=True),
CXG_OBS_TERM_FIELDS = [ # Columns pulled from the CXG H5AD without modification.
FieldSpec(name="assay", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="assay_ontology_term_id", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="cell_type", type=pa.large_string(), is_dictionary=True),
Expand All @@ -121,8 +91,8 @@
FieldSpec(name="disease", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="disease_ontology_term_id", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="donor_id", type=pa.large_string(), is_dictionary=True),
("is_primary_data", pa.bool_()),
("observation_joinid", pa.large_string()),
FieldSpec(name="is_primary_data", type=pa.bool_(), is_dictionary=False),
FieldSpec(name="observation_joinid", type=pa.large_string(), is_dictionary=False),
FieldSpec(name="self_reported_ethnicity", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="self_reported_ethnicity_ontology_term_id", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="sex", type=pa.large_string(), is_dictionary=True),
Expand All @@ -131,57 +101,34 @@
FieldSpec(name="tissue", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="tissue_ontology_term_id", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="tissue_type", type=pa.large_string(), is_dictionary=True),
]
CXG_OBS_TERM_FIELDS_SPATIAL = [ # Spatial-specific columns
FieldSpec(name="in_tissue", type=pa.int64(), is_dictionary=False),
FieldSpec(name="array_row", type=pa.int64(), is_dictionary=False),
FieldSpec(name="array_col", type=pa.int64(), is_dictionary=False),
]
CENSUS_OBS_FIELDS_MAPPED = [ # Annotation columns created by census builder
FieldSpec(name="tissue_general", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="tissue_general_ontology_term_id", type=pa.large_string(), is_dictionary=True),
("raw_sum", pa.float64()),
("nnz", pa.int64()),
("raw_mean_nnz", pa.float64()),
("raw_variance_nnz", pa.float64()),
("n_measured_vars", pa.int64()),
]
CENSUS_OBS_TABLE_SPEC = TableSpec.create(CENSUS_OBS_FIELDS, use_arrow_dictionary=USE_ARROW_DICTIONARY)

"""
Materialization (filter pipelines, capacity, etc) of obs/var schema in TileDB is tuned by empirical testing.
"""
# Numeric columns
_NumericObsAttrs = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]
# Categorical/dict-like columns
_DictLikeObsAttrs = [
f.name
for f in CENSUS_OBS_FIELDS
if isinstance(f, FieldSpec) and f.is_dictionary
if f.is_dictionary and f.name not in (_NumericObsAttrs + ["soma_joinid"])
CENSUS_OBS_STATS_FIELDS = [ # Columns for stats calculated during build
FieldSpec(name="raw_sum", type=pa.float64(), is_dictionary=False),
FieldSpec(name="nnz", type=pa.int64(), is_dictionary=False),
FieldSpec(name="raw_mean_nnz", type=pa.float64(), is_dictionary=False),
FieldSpec(name="raw_variance_nnz", type=pa.float64(), is_dictionary=False),
FieldSpec(name="n_measured_vars", type=pa.int64(), is_dictionary=False),
]
# Best of the rest
_AllOtherObsAttrs = [
f.name
for f in CENSUS_OBS_TABLE_SPEC.fields
if f.name not in (_DictLikeObsAttrs + _NumericObsAttrs + ["soma_joinid"])
CENSUS_OBS_CORE_FIELDS = [
FieldSpec(name="soma_joinid", type=pa.int64(), is_dictionary=False),
FieldSpec(name="dataset_id", type=pa.large_string(), is_dictionary=True),
]
# Dict filter varies depending on whether we are using dictionary types in the schema
_DictLikeFilter: list[Any] = (
[{"_type": "ZstdFilter", "level": 9}]
if USE_ARROW_DICTIONARY
else ["DictionaryFilter", {"_type": "ZstdFilter", "level": 19}]
)
CENSUS_OBS_PLATFORM_CONFIG = {
"tiledb": {
"create": {
"capacity": 2**16,
"dims": {"soma_joinid": {"filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}]}},
"attrs": {
**{
k: {"filters": ["ByteShuffleFilter", {"_type": "ZstdFilter", "level": 9}]} for k in _NumericObsAttrs
},
**{k: {"filters": _DictLikeFilter} for k in _DictLikeObsAttrs},
**{k: {"filters": [{"_type": "ZstdFilter", "level": 19}]} for k in _AllOtherObsAttrs},
},
"offsets_filters": ["DoubleDeltaFilter", {"_type": "ZstdFilter", "level": 19}],
"allows_duplicates": True,
}
}
}
# These are not actually written, so pyarrow type is not important
CXG_OBS_FIELDS_READ = [ # Columns READ from the CXG H5AD - see open_anndata()
FieldSpec(name="organism", type=pa.large_string(), is_dictionary=True),
FieldSpec(name="organism_ontology_term_id", type=pa.large_string(), is_dictionary=True),
]
# TODO: remove this list and derive the names from CENSUS_OBS_STATS_FIELDS instead
CENSUS_OBS_STATS_COLUMNS = ["raw_sum", "nnz", "raw_mean_nnz", "raw_variance_nnz", "n_measured_vars"]

CXG_VAR_COLUMNS_READ: tuple[str, ...] = (
"_index",
Expand Down
Loading
Loading