diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..19fb3a8f4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +max-line-length = 120 +extend-ignore = + # See https://github.com/PyCQA/pycodestyle/issues/373 + E203, + E402 +exclude = + .git/ + venv/ + tmp/ + .ipynb_checkpoints/ + __pycache__ + diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml new file mode 100644 index 000000000..5685576fb --- /dev/null +++ b/.github/workflows/formatting.yml @@ -0,0 +1,35 @@ +name: Python Linting + +on: + pull_request: + push: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + defaults: + run: + working-directory: . + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.9" + - run: pip install -U pip + - name: Python Black + run: | + python -m pip install black black[jupyter] + python -m black --check --diff . + - name: Python isort + run: | + python -m pip install isort + python -m isort --check --diff . + - name: Python style with flake8[bugbear] + run: | + python -m pip install flake8-bugbear + python -m flake8 . + - name: Check type annotations mypy + run: | + python -m pip install mypy==0.982 types-setuptools types-requests numpy + python -m mypy . diff --git a/.gitignore b/.gitignore index b6e47617d..ea9e0004e 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,7 @@ dmypy.json # Pyre type checker .pyre/ + +# common user tmp directories +tmp +temp diff --git a/README.md b/README.md index 357d71617..1bc2711e5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # CELLxGENE Cell Census -This repository contains documentation and example code related to the Chan Zuckerberg CELLxGENE Cell Census. +**Status**: Unstable, under rapid development + +This repository contains documentation and example code related to the Chan Zuckerberg CELLxGENE Cell Census, and a client (API) package to simplify accessing the Cell Census data. The CZ Cell Census is an aggregation of all public single cell data available in [CELLxGENE Discover](https://cellxgene.cziscience.com/), published in API-accessible formats, including the [SOMA API](https://github.com/single-cell-data/). diff --git a/api/python/cell_census/LICENSE b/api/python/cell_census/LICENSE new file mode 100644 index 000000000..9b6892998 --- /dev/null +++ b/api/python/cell_census/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Chan Zuckerberg Initiative + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/api/python/cell_census/README.md b/api/python/cell_census/README.md
new file mode 100644
index 000000000..0509228ca
--- /dev/null
+++ b/api/python/cell_census/README.md
@@ -0,0 +1,9 @@
+The `cell_census` package provides an API to facilitate use of the CZI Science Cell Census.
+
+**Status**: Unstable, under rapid development
+
+For more information, see the [cell_census repo](https://github.com/chanzuckerberg/cell-census/).
+
+### For More Help
+
+For more help, please file an issue on the repo, or contact us at <cellxgene@chanzuckerberg.com>.
+
+If you believe you have found a security issue, we would appreciate notification. Please send email to <security@chanzuckerberg.com>.
diff --git a/api/python/cell_census/setup.cfg b/api/python/cell_census/setup.cfg
new file mode 100644
index 000000000..02c159d18
--- /dev/null
+++ b/api/python/cell_census/setup.cfg
@@ -0,0 +1,27 @@
+[metadata]
+name = cell_census
+version = attr: cell_census.__version__
+author = Chan Zuckerberg Initiative
+author_email = cellxgene@chanzuckerberg.com
+description = API to simplify use of the CZI Science CELLxGENE Cell Census
+long_description = file: README.md LICENSE
+license = MIT
+url = https://github.com/chanzuckerberg/cell-census
+
+[options]
+python_requires = >= 3.8
+install_requires =
+    numba
+    numpy
+    requests
+    tiledb
+    tiledbsoma
+    typing_extensions
+    s3fs
+    scikit-misc
+package_dir =
+    = src
+packages = find:
+
+[options.packages.find]
+where = src
diff --git a/api/python/cell_census/setup.py b/api/python/cell_census/setup.py
new file mode 100644
index 000000000..7f1a1763c
--- /dev/null
+++ b/api/python/cell_census/setup.py
@@ -0,0 +1,4 @@
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
diff --git a/api/python/cell_census/src/cell_census/__init__.py b/api/python/cell_census/src/cell_census/__init__.py
new file mode 100644
index 000000000..29eca5a04
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/__init__.py
@@ -0,0 +1,14 @@
+from .get_anndata import get_anndata
+from .open import download_source_h5ad, get_source_h5ad_uri, open_soma
+from .release_directory import get_directory, get_release_description
+
+__version__ = "0.0.1-dev0"
+
+__all__ = [
+    "download_source_h5ad",
+    "get_anndata",
+    "get_directory",
+    "get_source_h5ad_uri",
+    "get_release_description",
+    "open_soma",
+]
diff --git a/api/python/cell_census/src/cell_census/compute/__init__.py b/api/python/cell_census/src/cell_census/compute/__init__.py
new file mode 100644
index 000000000..1a6326ed6
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/compute/__init__.py
@@ -0,0 +1,7 @@
+from .highly_variable_genes import highly_variable_genes
+from .meanvar import OnlineMatrixMeanVariance
+
+__all__ = [
+    "highly_variable_genes",
+    "OnlineMatrixMeanVariance",
+]
diff --git a/api/python/cell_census/src/cell_census/compute/highly_variable_genes.py b/api/python/cell_census/src/cell_census/compute/highly_variable_genes.py
new file mode 100644
index 000000000..18a400158
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/compute/highly_variable_genes.py
@@ -0,0 +1,94 @@
+import numpy as np
+import pandas as pd
+
+from ..experiment_query import ExperimentQuery
+from .meanvar import OnlineMatrixMeanVariance
+
+
+def highly_variable_genes(query: ExperimentQuery, n_top_genes: int = 10) -> pd.DataFrame:
+    """
+    Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy
+    """
+    use_prefetch = True
+
+    try:
+        import skmisc.loess
+    except ImportError:
+        raise ImportError("Please install the skmisc package via `pip install --user scikit-misc`")
+
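+    # Pass 1: accumulate per-gene mean and variance online, one Arrow table at a
+    # time, so the full X matrix never needs to be materialized in memory.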
indexer = query.get_indexer() + mvn = OnlineMatrixMeanVariance(query.n_obs, query.n_vars) + for arrow_tbl in query.X("raw", prefetch=use_prefetch): + var_dim = indexer.var_index(arrow_tbl["soma_dim_1"]) + data = arrow_tbl["soma_data"].to_numpy() + mvn.update(var_dim, data) + + u, v = mvn.finalize() + var_df = pd.DataFrame( + index=pd.Index(data=query.var_joinids(), name="soma_joinid"), + data={ + "means": u, + "variances": v, + }, + ) + + estimated_variances = np.zeros((len(var_df),), dtype=np.float64) + not_const = v > 0 + y = np.log10(v[not_const]) + x = np.log10(u[not_const]) + model = skmisc.loess.loess(x, y, span=0.3, degree=2) + model.fit() + estimated_variances[not_const] = model.outputs.fitted_values + reg_std = np.sqrt(10**estimated_variances) + + # A second pass over the data is required because the clip value + # is determined by the first pass + N = query.n_obs + vmax = np.sqrt(N) + clip_val = reg_std * vmax + u + counts_sum = np.zeros((query.n_vars,), dtype=np.float64) # clipped + squared_counts_sum = np.zeros((query.n_vars,), dtype=np.float64) # clipped + for arrow_tbl in query.X("raw", prefetch=use_prefetch): + var_dim = indexer.var_index(arrow_tbl["soma_dim_1"]) + data = arrow_tbl["soma_data"].to_numpy() + # clip + mask = data > clip_val[var_dim] + data = data.copy() + data[mask] = clip_val[var_dim[mask]] + np.add.at(counts_sum, var_dim, data) + np.add.at(squared_counts_sum, var_dim, data**2) + + norm_gene_vars = (1 / ((N - 1) * np.square(reg_std))) * ( + (N * np.square(u)) + squared_counts_sum - 2 * counts_sum * u + ) + norm_gene_vars = norm_gene_vars.reshape(1, -1) + + # argsort twice gives ranks, small rank means most variable + ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1), axis=1) + + # this is done in SelectIntegrationFeatures() in Seurat v3 + ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32) + num_batches_high_var = np.sum((ranked_norm_gene_vars < n_top_genes).astype(int), axis=0) + ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan + ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars) # type: ignore + median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan) # type: ignore + + var_df = var_df.assign( + highly_variable_nbatches=pd.Series(num_batches_high_var, index=var_df.index), + highly_variable_rank=pd.Series(median_ranked, index=var_df.index), + variances_norm=pd.Series(np.mean(norm_gene_vars, axis=0), index=var_df.index), + ) + + sorted_index = ( + var_df[["highly_variable_rank", "highly_variable_nbatches"]] + .sort_values( + ["highly_variable_rank", "highly_variable_nbatches"], + ascending=[True, False], + na_position="last", + ) + .index + ) + var_df["highly_variable"] = False + var_df = var_df.drop(columns=["highly_variable_nbatches"]) + var_df.loc[sorted_index[: int(n_top_genes)], "highly_variable"] = True + return var_df diff --git a/api/python/cell_census/src/cell_census/compute/meanvar.py b/api/python/cell_census/src/cell_census/compute/meanvar.py new file mode 100644 index 000000000..722b12ac2 --- /dev/null +++ b/api/python/cell_census/src/cell_census/compute/meanvar.py @@ -0,0 +1,76 @@ +import numba +import numpy as np +import numpy.typing as npt + + +class OnlineMatrixMeanVariance: + n_samples: int + n_variables: int + + def __init__(self, n_samples: int, n_variables: int): + """ + Compute mean and variance for n_variables over n_samples, encoded + in a COO format. 
Equivalent to:
+            numpy.mean(data, axis=0)
+            numpy.var(data, axis=0)
+        where the input `data` is of shape (n_samples, n_variables)
+        """
+        self.n_samples = n_samples
+        self.n_variables = n_variables
+
+        self.n_a = np.zeros((n_variables,), dtype=np.int32)
+        self.u_a = np.zeros((n_variables,), dtype=np.float64)
+        self.M2_a = np.zeros((n_variables,), dtype=np.float64)
+
+    def update(self, coord_vec: npt.NDArray[np.int64], value_vec: npt.NDArray[np.float32]) -> None:
+        _mean_variance_update(coord_vec, value_vec, self.n_a, self.u_a, self.M2_a)
+
+    def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
+        """
+        Returns a tuple containing the mean and variance
+        """
+        u, M2 = _mean_variance_finalize(self.n_samples, self.n_a, self.u_a, self.M2_a)
+
+        # compute sample variance
+        var = M2 / max(1, (self.n_samples - 1))
+
+        return u, var
+
+
+# TODO: add type signatures to the numba annotations, removing the need for dynamic generation
+
+
+@numba.jit(nopython=True, nogil=True)  # type: ignore[misc] # See https://github.com/numba/numba/issues/7424
+def _mean_variance_update(
+    col_arr: npt.NDArray[np.int64],
+    val_arr: npt.NDArray[np.float32],
+    n: npt.NDArray[np.int32],
+    u: npt.NDArray[np.float64],
+    M2: npt.NDArray[np.float64],
+) -> None:
+    """
+    Incrementally accumulate the mean and the sum of squared distances from the mean,
+    using Welford's online method.
+    """
+    for col, val in zip(col_arr, val_arr):
+        u_prev = u[col]
+        M2_prev = M2[col]
+        n[col] += 1
+        u[col] = u_prev + (val - u_prev) / n[col]
+        M2[col] = M2_prev + (val - u_prev) * (val - u[col])
+
+
+@numba.jit(nopython=True, nogil=True)  # type: ignore[misc] # See https://github.com/numba/numba/issues/7424
+def _mean_variance_finalize(
+    n_samples: int, n_a: npt.NDArray[np.int32], u_a: npt.NDArray[np.float64], M2_a: npt.NDArray[np.float64]
+) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
+    """
+    Finalize the incremental values, accounting for missing elements (due to sparse input).
+    The non-sparse and sparse parts are combined using Chan's parallel adaptation of
+    Welford's method. The code assumes the sparse elements are all zero and ignores those terms.
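+
+    With n = n_a + n_b, and the sparse block contributing u_b == 0 and M2_b == 0,
+    Chan's combination reduces to (this is exactly what the code below computes):
+
+        u  = (n_a * u_a) / n
+        M2 = M2_a + u_a**2 * n_a * n_b / n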
+ """ + n_b = n_samples - n_a + delta = -u_a # assumes u_b == 0 + u = (n_a * u_a) / n_samples + M2 = M2_a + delta**2 * n_a * n_b / n_samples # assumes M2_b == 0 + return u, M2 diff --git a/api/python/cell_census/src/cell_census/experiment_query/__init__.py b/api/python/cell_census/src/cell_census/experiment_query/__init__.py new file mode 100644 index 000000000..09a371d9a --- /dev/null +++ b/api/python/cell_census/src/cell_census/experiment_query/__init__.py @@ -0,0 +1,12 @@ +from .axis import AxisQuery +from .query import ExperimentQuery, experiment_query +from .types import AxisColumnNames +from .util import X_as_series + +__all__ = [ + "experiment_query", + "AxisColumnNames", + "AxisQuery", + "ExperimentQuery", + "X_as_series", +] diff --git a/api/python/cell_census/src/cell_census/experiment_query/anndata.py b/api/python/cell_census/src/cell_census/experiment_query/anndata.py new file mode 100644 index 000000000..b33c5ba68 --- /dev/null +++ b/api/python/cell_census/src/cell_census/experiment_query/anndata.py @@ -0,0 +1,35 @@ +from typing import Dict, Tuple + +import anndata +import pyarrow as pa +import scipy.sparse as sparse + +from .types import ExperimentQueryReadArrowResult + + +def arrow_to_scipy_csr(X: pa.Table, shape: Tuple[int, int]) -> sparse.csr_matrix: + return sparse.csr_matrix((X["soma_data"].to_numpy(), (X["_dim_0"].to_numpy(), X["_dim_1"].to_numpy())), shape=shape) + + +def make_anndata(query_result: ExperimentQueryReadArrowResult) -> anndata.AnnData: + + obs = query_result["obs"] + obs = obs.to_pandas() + obs.index = obs.index.map(str) + + var = query_result["var"] + var = var.to_pandas() + var.index = var.index.map(str) + + shape = (len(obs), len(var)) + + X = query_result.get("X", None) + if X is not None: + X = arrow_to_scipy_csr(X, shape) + + X_layers = query_result.get("X_layers", {}) + layers: Dict[str, sparse.csr_matrix] = {} + for X_layer_name, X_layer_table in X_layers.items(): + layers[X_layer_name] = arrow_to_scipy_csr(X_layer_table, shape) + + return anndata.AnnData(X=X, obs=obs, var=var, layers=(layers if len(layers) else None)) diff --git a/api/python/cell_census/src/cell_census/experiment_query/axis.py b/api/python/cell_census/src/cell_census/experiment_query/axis.py new file mode 100644 index 000000000..15619b074 --- /dev/null +++ b/api/python/cell_census/src/cell_census/experiment_query/axis.py @@ -0,0 +1,64 @@ +from dataclasses import dataclass +from typing import Optional, Tuple, TypedDict, Union + +import numpy as np +import numpy.typing as npt +import pyarrow as pa + +# Type declaration/helpers local to this file +# +Coordinates = Tuple[Union[slice, int, npt.ArrayLike], ...] +ValueFilter = str + +MatrixAxisQuery = TypedDict( + "MatrixAxisQuery", + { + "obs": "AxisQuery", + "var": "AxisQuery", + }, +) + + +@dataclass() +class AxisQuery: + """ + Define a single-axis dataframe query based upon either a value filter predicate or coordinates. 
+
+    Can have value:
+    * None - no query, ie, all data
+    * Coordinates - a set of coordinates on the axis dataframe index (or soma_rowids if a dense dataframe)
+    * A SOMA `value_filter` across columns in the axis dataframe
+
+    Examples:
+    ```
+    AxisQuery()
+    AxisQuery(coords=[0,1,2])
+    AxisQuery(value_filter="tissue == 'lung'")
+    ```
+    """
+
+    value_filter: Optional[str] = None
+    coords: Optional[Coordinates] = None
+
+    def __post_init__(self) -> None:
+        # TODO: Error class
+        if self.value_filter is not None and self.coords is not None:
+            raise Exception("FilterSpec - value_filter or coords may be specified, but not both.")
+
+        if self.coords is None:
+            # A value_filter (or an empty query) defaults to all coordinates
+            self.coords = (slice(None),)
+        else:
+            if not isinstance(self.coords, tuple):
+                raise Exception("FilterSpec - coords must be a tuple of int, slice or numpy.array_like")
+            coords = []
+            for c in self.coords:
+                if isinstance(c, (int, slice)):
+                    coords.append(c)
+                else:
+                    coords.append(pa.array(np.array(c, dtype=np.int64)))
+            self.coords = tuple(coords)
+
+    def is_value_filter(self) -> bool:
+        """Return True if this is a value filter, else False if coordinates"""
+        return self.value_filter is not None
diff --git a/api/python/cell_census/src/cell_census/experiment_query/query.py b/api/python/cell_census/src/cell_census/experiment_query/query.py
new file mode 100644
index 000000000..58b560401
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/experiment_query/query.py
@@ -0,0 +1,459 @@
+import asyncio
+import concurrent.futures
+import contextvars
+import functools
+import inspect
+from contextlib import contextmanager
+from typing import (
+    AsyncIterator,
+    Callable,
+    Generator,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+)
+
+import anndata
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import pyarrow as pa
+import tiledbsoma as soma
+from typing_extensions import ParamSpec
+
+from .anndata import make_anndata
+from .axis import AxisQuery, MatrixAxisQuery
+from .types import AxisColumnNames, ExperimentQueryReadArrowResult
+
+AxisJoinIds = TypedDict(
+    "AxisJoinIds",
+    {
+        "obs": pa.Array,
+        "var": pa.Array,
+    },
+)
+
+
+class ExperimentQuery:
+    """
+    This is a prototype.
+
+    ExperimentQuery allows easy selection and extraction of data from a single soma.Measurement
+    in a soma.Experiment.
+
+    IMPORTANT: this class is not thread safe.
+
+    IMPORTANT: this query class assumes it can store the full result of both axis dataframe
+    queries in memory, and only provides incremental access to the underlying X NdArray. API
+    features such as `n_obs` and `n_vars` codify this assumption.
+
+    IMPORTANT: you must call `close()` on any instance of this class in order to release
+    underlying resources. It is strongly suggested that the context manager `experiment_query`
+    is used to make this easy/safe.
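+
+    A minimal usage sketch (assuming `exp` is an already-open soma.Experiment with an
+    "RNA" measurement; the filter shown is illustrative):
+
+        with experiment_query(exp, "RNA", obs_query=AxisQuery(value_filter="tissue == 'lung'")) as query:
+            adata = query.read_as_anndata("raw")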
+
+    TODO: see chanzuckerberg/soma-scratch#9
+    """
+
+    experiment: soma.Experiment
+    ms: str
+    _query: MatrixAxisQuery
+    _joinids: AxisJoinIds
+    _indexer: "AxisIndexer"
+    _default_threadpool: concurrent.futures.ThreadPoolExecutor
+
+    def __init__(
+        self,
+        experiment: soma.Experiment,
+        measurement_name: str,
+        *,
+        obs_query: Optional[AxisQuery] = None,
+        var_query: Optional[AxisQuery] = None,
+    ):
+        if not experiment.exists():
+            raise ValueError("Experiment does not exist")
+        if measurement_name not in experiment.ms:
+            raise ValueError("Measurement does not exist in the experiment")
+
+        self.experiment = experiment
+        self.ms = measurement_name
+
+        self._query = {
+            "obs": obs_query if obs_query is not None else AxisQuery(coords=(slice(None),)),
+            "var": var_query if var_query is not None else AxisQuery(coords=(slice(None),)),
+        }
+        self._joinids = {
+            "obs": None,
+            "var": None,
+        }
+        self._indexer = AxisIndexer(self)
+
+        # TODO: user should be able to set this, a la asyncio loop.set_default_executor()
+        self._default_threadpool = concurrent.futures.ThreadPoolExecutor()
+
+    def close(self) -> None:
+        """
+        Clean up and close all resources. This must be called or the thread pool
+        will not be released, etc.
+        """
+        self._default_threadpool.shutdown()
+
+    def _read_axis_dataframe(
+        self,
+        axis: Literal["obs", "var"],
+        axis_df: soma.DataFrame,
+        *,
+        column_names: Optional[Sequence[str]],
+    ) -> pa.Table:
+        """
+        Read the specified axis. Will load and save the resulting soma_joinids for that
+        axis if they are not already known.
+        """
+        query = self._query[axis]
+        need_joinids = self._joinids[axis] is None
+
+        query_columns = column_names
+        if need_joinids and column_names is not None and "soma_joinid" not in column_names:
+            query_columns = ["soma_joinid"] + list(column_names)
+
+        tbl = axis_df.read_all(ids=query.coords, value_filter=query.value_filter, column_names=query_columns)
+
+        if need_joinids:
+            self._joinids[axis] = tbl.column("soma_joinid").combine_chunks()
+            assert self._joinids[axis] is not None
+
+        if column_names is not None:
+            tbl = tbl.select(column_names)
+        return tbl
+
+    def _read_axis_joinids(self, axis: Literal["obs", "var"], axis_df: soma.DataFrame) -> pa.Array:
+        if self._joinids[axis] is None:
+            self._read_axis_dataframe(axis, axis_df, column_names=["soma_joinid"])
+        return self._joinids[axis]
+
+    def obs(self, *, column_names: Optional[Sequence[str]] = None) -> pa.Table:
+        """Return obs as an Arrow table."""
+        return self._read_axis_dataframe("obs", self.experiment.obs, column_names=column_names)
+
+    def var(self, *, column_names: Optional[Sequence[str]] = None) -> pa.Table:
+        """Return var as an Arrow table."""
+        return self._read_axis_dataframe("var", self.experiment.ms[self.ms].var, column_names=column_names)
+
+    def obs_joinids(self) -> pa.Array:
+        return self._read_axis_joinids("obs", self.experiment.obs)
+
+    def var_joinids(self) -> pa.Array:
+        return self._read_axis_joinids("var", self.experiment.ms[self.ms].var)
+
+    @property
+    def n_obs(self) -> int:
+        return len(self.obs_joinids())
+
+    @property
+    def n_vars(self) -> int:
+        return len(self.var_joinids())
+
+    def _fetchX(self, X: soma.SparseNdArray, prefetch: bool = False) -> Iterator[pa.Table]:
+        assert self._joinids["obs"] is not None
+        assert self._joinids["var"] is not None
+
+        obs_joinids = self._joinids["obs"]
+        var_joinids = self._joinids["var"]
+
+        if len(obs_joinids) == 0 or len(var_joinids) == 0:
+            # yield (not return) the empty table, so callers always receive an iterator
+            yield pa.Table.from_pylist([], schema=X.schema)
+            return
+
+        if not prefetch:
+            # yield for clarity
+            yield
from cast(Iterator[pa.Table], X.read_table((obs_joinids, var_joinids))) + + else: + # prefetch + fn = wrap_generator(X.read_table((obs_joinids, var_joinids))) + _prefetch_future = self._default_threadpool.submit(fn) + while True: + value, done = _prefetch_future.result() + if done: + return + assert value is not None + _prefetch_future = self._default_threadpool.submit(fn) + yield value + + def X(self, layer: str, prefetch: bool = False) -> Iterator[pa.Table]: + """ + Return X as an iterator of Arrow Tables. + """ + if not layer: + raise ValueError("Must specify X layer") + if layer not in self.experiment.ms[self.ms].X: + raise ValueError("Unknown X layer") + + X = self.experiment.ms[self.ms].X[layer] + if X.soma_type != "SOMASparseNdArray": + raise NotImplementedError("Dense array unsupported") + + futures = [] + if not self._joinids["obs"]: + futures.append(self._default_threadpool.submit(self.obs_joinids)) + if not self._joinids["var"]: + futures.append(self._default_threadpool.submit(self.var_joinids)) + if futures: + concurrent.futures.wait(futures) + + yield from self._fetchX(X, prefetch=prefetch) + + def read( + self, + X_name: str, + *, + use_position_indexing: bool = False, + column_names: Optional[AxisColumnNames] = None, + X_layers: Optional[List[str]] = None, + ) -> ExperimentQueryReadArrowResult: + """ + Read the _entire_ query result into Arrow Tables. Low-level routine + intended to be the basis for exporting to other in-core formats, such + as AnnData. + """ + X_collection = self.experiment.ms[self.ms].X + X_layers = [] if X_layers is None else X_layers + all_X_names = [X_name] + X_layers + for _xname in all_X_names: + if not isinstance(_xname, str) or not _xname: + raise ValueError("X layer names must be specified as a string.") + if _xname not in X_collection: + raise ValueError("Unknown X layer name") + # TODO: dense array slicing + if X_collection[_xname].soma_type != "SOMASparseNdArray": + raise NotImplementedError("Dense array unsupported") + + if column_names is None: + column_names = {"obs": None, "var": None} + if "obs" not in column_names: + column_names["obs"] = None + if "var" not in column_names: + column_names["var"] = None + + futures = ( + self._default_threadpool.submit( + self._read_axis_dataframe, "obs", self.experiment.obs, column_names=column_names["obs"] + ), + self._default_threadpool.submit( + self._read_axis_dataframe, "var", self.experiment.ms[self.ms].var, column_names=column_names["var"] + ), + ) + concurrent.futures.wait(futures) + obs_table, var_table = (f.result() for f in futures) + + X_tables = { + _xname: pa.concat_tables(self._fetchX(X_collection[_xname], prefetch=True)) for _xname in all_X_names + } + if use_position_indexing: + X_tables = self._rewrite_X_for_positional_indexing(X_tables) + + X = X_tables.pop(X_name) + query_result: ExperimentQueryReadArrowResult = {"obs": obs_table, "var": var_table, "X": X} + if len(X_layers) > 0: + assert len(X_layers) == len(X_tables) + query_result["X_layers"] = X_tables + + return query_result + + def read_as_anndata( + self, + X_name: str, + *, + column_names: Optional[AxisColumnNames] = None, + X_layers: Optional[List[str]] = None, + ) -> anndata.AnnData: + """ + Execute the query and return result as an AnnData in-memory object. 
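+
+        A brief sketch (argument values are illustrative):
+
+            adata = query.read_as_anndata("raw", column_names={"obs": ["cell_type"], "var": ["feature_name"]})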
+ """ + query_result = self.read(X_name, column_names=column_names, X_layers=X_layers, use_position_indexing=True) + return make_anndata(query_result) + + def _rewrite_X_for_positional_indexing(self, X_tables: dict[str, pa.Table]) -> dict[str, pa.Table]: + """ + This is a private convenience function to convert axis dataframe to X matrix joins + from `soma_joinid`-based joins to positionally indexed joins (like AnnData uses). + + Input is organized as: + obs[i] annotates X[ obs[i].soma_joinid, : ] + and + var[j] annotates X[ :, var[j].soma_joinid ] + + Output is organized as: + obs[i] annotates X[i, :] + and + var[j] annotates X[:, j] + + In addition, the `soma_joinid` column is dropped from the axis dataframes. + """ + new_X_tables = {} + indexer = self.get_indexer() + for X_name, X_table in X_tables.items(): + new_X_tables[X_name] = pa.Table.from_arrays( + ( + indexer.obs_index(X_table["soma_dim_0"]), + indexer.var_index(X_table["soma_dim_1"]), + X_table["soma_data"].to_numpy(), # as a side effect, consolidates chunks + ), + names=("_dim_0", "_dim_1", "soma_data"), + ) + return new_X_tables + + def get_async(self) -> "AsyncExperimentQuery": + return AsyncExperimentQuery(self) + + def get_indexer(self) -> "AxisIndexer": + return self._indexer + + +@contextmanager +def experiment_query( + experiment: soma.Experiment, + measurement_name: str, + *, + obs_query: Optional[AxisQuery] = None, + var_query: Optional[AxisQuery] = None, +) -> Iterator[ExperimentQuery]: + """ + Context manager which simplifies use of the query by ensuring that + query.close() is called. + """ + query = ExperimentQuery(experiment, measurement_name, obs_query=obs_query, var_query=var_query) + yield query + query.close() + + +class AsyncExperimentQuery: + """ + An async proxy for ExperimentQuery, allowing use with coroutines + """ + + query: ExperimentQuery + + def __init__(self, query: ExperimentQuery): + self.query = query + + def close(self) -> None: + self.query.close() + + @property + def n_obs(self) -> int: + return self.query.n_obs + + @property + def n_vars(self) -> int: + return self.query.n_vars + + async def obs(self, *, column_names: Optional[Sequence[str]] = None) -> AsyncIterator[pa.Table]: + return await to_thread(self.query.obs, column_names=column_names) + + async def var(self, *, column_names: Optional[Sequence[str]] = None) -> AsyncIterator[pa.Table]: + return await to_thread(self.query.var, column_names=column_names) + + async def obs_joinids(self) -> pa.Array: + if self.query._joinids["obs"] is not None: + return self.query._joinids["obs"] + return await to_thread(self.query.obs_joinids) + + async def var_joinids(self) -> pa.Array: + if self.query._joinids["var"] is not None: + return self.query._joinids["var"] + return await to_thread(self.query.var_joinids) + + async def X(self, layer: str, prefetch: bool = False) -> AsyncIterator[pa.Table]: + chunk: pa.Table + async for chunk in async_iter((i for i in self.query.X(layer, prefetch))): + yield chunk + + +T = TypeVar("T") + + +async def async_iter(gen: Generator[T, None, None]) -> AsyncIterator[T]: + """ + Convert a generator into an async coroutine + """ + fn = wrap_generator(gen) + while True: + value, done = await to_thread(fn) + if done: + return + assert value is not None + yield value + + +def wrap_generator(gen: Generator[T, None, None]) -> Callable[[], tuple[Optional[T], bool]]: + """ + Wrap a generator, making it a "normal" function that is amenable + to running in a thread. 
Each time it is called, it returns a tuple:
+        If there is another value: (next_value, False)
+        If end of iteration: (None, True)
+    """
+    assert inspect.isgenerator(gen)
+
+    def _next() -> tuple[Optional[T], bool]:
+        try:
+            value = next(gen)
+            return value, False
+        except StopIteration:
+            return None, True
+
+    return _next
+
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+async def to_thread(__func: Callable[_P, _R], *args: _P.args, **kwargs: _P.kwargs) -> _R:
+    """
+    Reimplementation of asyncio.to_thread, which was introduced in Py 3.9. Added
+    here for support on earlier versions of Python.
+
+    See https://docs.python.org/3/library/asyncio-task.html#asyncio.to_thread
+    """
+    loop = asyncio.events.get_running_loop()
+    ctx = contextvars.copy_context()
+    func_call = cast(Callable[..., _R], functools.partial(ctx.run, __func, *args, **kwargs))
+    return await loop.run_in_executor(None, func_call)
+
+
+class AxisIndexer:
+    """
+    Given a query, provides index-building services for the obs/var axes.
+    """
+
+    query: ExperimentQuery
+    _obs_index: pd.Index
+    _var_index: pd.Index
+
+    def __init__(self, query: Union[ExperimentQuery, AsyncExperimentQuery]):
+        if isinstance(query, AsyncExperimentQuery):
+            query = query.query
+
+        self.query = query
+        self._obs_index = None
+        self._var_index = None
+
+    def obs_index(self, coords: Union[pa.Array, pa.ChunkedArray, npt.NDArray[np.int64]]) -> npt.NDArray[np.intp]:
+        if not isinstance(coords, np.ndarray):
+            coords = coords.to_numpy()
+        if self._obs_index is None:
+            self._obs_index = pd.Index(data=self.query.obs_joinids().to_numpy())
+        return cast(npt.NDArray[np.intp], self._obs_index.get_indexer(coords))
+
+    def var_index(self, coords: Union[pa.Array, pa.ChunkedArray, npt.NDArray[np.int64]]) -> npt.NDArray[np.intp]:
+        if not isinstance(coords, np.ndarray):
+            coords = coords.to_numpy()
+        if self._var_index is None:
+            self._var_index = pd.Index(data=self.query.var_joinids().to_numpy())
+        return cast(npt.NDArray[np.intp], self._var_index.get_indexer(coords))
diff --git a/api/python/cell_census/src/cell_census/experiment_query/types.py b/api/python/cell_census/src/cell_census/experiment_query/types.py
new file mode 100644
index 000000000..6ce9c57ed
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/experiment_query/types.py
@@ -0,0 +1,32 @@
+"""
+Types global to this module
+"""
+from typing import Optional, Sequence, TypedDict
+
+import pandas as pd
+import pyarrow as pa
+
+# Sadly, you can't define a generic TypedDict....
+
+
+class ExperimentQueryReadArrowResult(TypedDict, total=False):
+    obs: pa.Table
+    var: pa.Table
+    X: pa.Table
+    X_layers: dict[str, pa.Table]
+
+
+class ExperimentQueryReadPandasResult(TypedDict, total=False):
+    obs: pd.DataFrame
+    var: pd.DataFrame
+    X: pd.DataFrame
+    X_layers: dict[str, pd.DataFrame]
+
+
+AxisColumnNames = TypedDict(
+    "AxisColumnNames",
+    {
+        "obs": Optional[Sequence[str]],  # None is all
+        "var": Optional[Sequence[str]],
+    },
+)
diff --git a/api/python/cell_census/src/cell_census/experiment_query/util.py b/api/python/cell_census/src/cell_census/experiment_query/util.py
new file mode 100644
index 000000000..fd7471a49
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/experiment_query/util.py
@@ -0,0 +1,19 @@
+import pandas as pd
+import pyarrow as pa
+
+
+def X_as_series(tbl: pa.Table) -> pd.Series:
+    """
+    Convert SOMA 2D data from an Arrow Table to a Pandas Series.
+
+    NOTE: this is not zero copy.
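+
+    For example, COO entries (soma_dim_0=0, soma_dim_1=2, soma_data=1.0) and
+    (1, 0, 3.0) become a sparse Series with MultiIndex [(0, 2), (1, 0)] and
+    values [1.0, 3.0].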
+ """ + data = tbl["soma_data"].to_numpy() + dim_0 = tbl["soma_dim_0"].to_numpy() + dim_1 = tbl["soma_dim_1"].to_numpy() + return pd.Series( + data, + pd.MultiIndex.from_arrays((dim_0, dim_1), names=("soma_dim_0", "soma_dim_1")), + dtype=pd.SparseDtype(data.dtype, fill_value=0), + name="soma_data", + ) diff --git a/api/python/cell_census/src/cell_census/get_anndata.py b/api/python/cell_census/src/cell_census/get_anndata.py new file mode 100644 index 000000000..3701b39d7 --- /dev/null +++ b/api/python/cell_census/src/cell_census/get_anndata.py @@ -0,0 +1,119 @@ +import re +from typing import List, Optional, TypedDict, Union + +import anndata +import tiledbsoma as soma + +from .experiment_query import AxisColumnNames, AxisQuery, experiment_query + +ObsQuery = TypedDict( + "ObsQuery", + { + "assay": Optional[Union[str, List[str]]], + "assay_ontology_term_id": Optional[Union[str, List[str]]], + "cell_type": Optional[Union[str, List[str]]], + "cell_type_ontology_term_id": Optional[Union[str, List[str]]], + "development_stage": Optional[Union[str, List[str]]], + "development_stage_ontology_term_id": Optional[Union[str, List[str]]], + "disease": Optional[Union[str, List[str]]], + "disease_ontology_term_id": Optional[Union[str, List[str]]], + "donor_id": Optional[Union[str, List[str]]], + "is_primary_data": Optional[bool], + "self_reported_ethnicity": Optional[Union[str, List[str]]], + "self_reported_ethnicity_ontology_term_id": Optional[Union[str, List[str]]], + "sex": Optional[Union[str, List[str]]], + "sex_ontology_term_id": Optional[Union[str, List[str]]], + "suspension_type": Optional[Union[str, List[str]]], + "tissue": Optional[Union[str, List[str]]], + "tissue_ontology_term_id": Optional[Union[str, List[str]]], + }, +) + +VarQuery = TypedDict( + "VarQuery", + { + "feature_id": Optional[Union[str, List[str]]], + "feature_name": Optional[Union[str, List[str]]], + }, +) + + +def _build_query(query_defn: Optional[Union[ObsQuery, VarQuery]] = None) -> Optional[AxisQuery]: + """ + Build a AxisQuery value filter from the user-defined query parameters. + """ + if query_defn is None: + return None + + query_conditions = [] + for name, val in query_defn.items(): + if isinstance(val, str): + query_conditions.append(f"{name} == '{val}'") + elif isinstance(val, list): + query_conditions.append(f"{name} in {val}") + else: + raise TypeError("Query must be string or list of strings") + + if len(query_conditions) == 0: + return None + + return AxisQuery(value_filter=" and ".join(query_conditions)) + + +def get_anndata( + census: soma.Collection, + organism: str, + measurement_name: str = "RNA", + X_name: str = "raw", + obs_query: Optional[ObsQuery] = None, + var_query: Optional[VarQuery] = None, + column_names: Optional[AxisColumnNames] = None, +) -> anndata.AnnData: + """ + Convience wrapper around soma.Experiment query, to build and execute a query, + and return it as an AnnData object. + + Parameters + ---------- + census : soma.Collection + The census object, usually returned by `cell_census.open_soma()` + organism : str + The organism to query, usually one of "Homo sapiens" or "Mus musculus" + measurement_name : str, default 'RNA' + The measurement object to query + X_name : str, default "raw" + The X layer to query + obs_query : dict[str, Union[str, List[str]]] + Obs (cell) query definition. Dict where keys are column names, and value is a + string or list of strings to match. All query terms must match (AND query). + var_query : dict[str, Union[str, List[str]]] + Var (gene) query definition. 
Dict where keys are column names, and value is a
+        string or list of strings to match. All query terms must match (AND query).
+    column_names: dict[Literal['obs', 'var'], List[str]]
+        Columns to fetch for the obs and var dataframes.
+
+    Returns
+    -------
+    anndata.AnnData - containing the census slice
+
+    Examples
+    --------
+    >>> get_anndata(census, "Mus musculus", obs_query={"tissue": "brain"})
+
+    >>> get_anndata(census, "Homo sapiens", column_names={"obs": ["tissue"]})
+
+    """
+
+    # lower/snake-case the organism name to find the experiment name
+    exp_name = re.sub(r"[ ]+", "_", organism).lower()
+
+    if exp_name not in census["census_data"]:
+        raise ValueError(f"Unknown organism {organism} - does not exist")
+    exp = census["census_data"][exp_name]
+    if exp.soma_type != "SOMAExperiment":
+        raise ValueError(f"Unknown organism {organism} - not a SOMA Experiment")
+
+    _obs_query = _build_query(obs_query)
+    _var_query = _build_query(var_query)
+    with experiment_query(exp, measurement_name=measurement_name, obs_query=_obs_query, var_query=_var_query) as query:
+        return query.read_as_anndata(X_name=X_name, column_names=column_names)
diff --git a/api/python/cell_census/src/cell_census/open.py b/api/python/cell_census/src/cell_census/open.py
new file mode 100644
index 000000000..543792bbb
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/open.py
@@ -0,0 +1,113 @@
+import os.path
+import urllib.parse
+from typing import Optional
+
+import s3fs
+import tiledb
+import tiledbsoma as soma
+
+from .release_directory import CensusLocator, CensusReleaseDescription, get_release_description
+from .util import uri_join
+
+# TODO: temporary work-around for lack of context/config in tiledbsoma. Replace with soma
+# `platform_config` when available.
+DEFAULT_TILEDB_CONFIGURATION = {
+    # https://docs.tiledb.com/main/how-to/configuration#configuration-parameters
+    "py.init_buffer_bytes": 1 * 1024**3,
+    "soma.init_buffer_bytes": 1 * 1024**3,
+    "py.deduplicate": "true",
+}
+
+
+def _open_soma(description: CensusReleaseDescription) -> soma.Collection:
+    locator = description["soma"]
+    tiledb_config = {**DEFAULT_TILEDB_CONFIGURATION}
+    s3_region = locator.get("s3_region", None)
+    if s3_region is not None:
+        tiledb_config["vfs.s3.region"] = locator["s3_region"]
+    return soma.Collection(uri=locator["uri"], ctx=tiledb.Ctx(tiledb_config))
+
+
+def open_soma(*, census_version: Optional[str] = "latest", uri: Optional[str] = None) -> soma.Collection:
+    """
+    Open the Cell Census by version (name) or URI, returning a soma.Collection containing
+    the top-level census.
+
+    TODO: add platform_config hook when it is further defined, allowing config overrides.
+    """
+
+    if uri is not None:
+        return soma.Collection(uri=uri, ctx=tiledb.Ctx(DEFAULT_TILEDB_CONFIGURATION))
+
+    if census_version is None:
+        raise ValueError("Must specify either a cell census version or an explicit URI.")
+
+    description = get_release_description(census_version)  # raises
+    return _open_soma(description)
+
+
+def get_source_h5ad_uri(dataset_id: str, *, census_version: str = "latest") -> CensusLocator:
+    """
+    Open the named version of the census, and return the URI for the dataset_id.
+
+    This does not guarantee that the H5AD exists or is accessible to the user.
+
+    Raises if dataset_id or census_version are unknown.
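+
+    A sketch of the returned locator shape (the URI value shown is illustrative only):
+
+        >>> get_source_h5ad_uri("...")
+        {'uri': 's3://.../<dataset_h5ad_path>', 's3_region': 'us-west-2'}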
+ """ + description = get_release_description(census_version) # raises + census = _open_soma(description) + dataset = census["census_info"]["datasets"].read_as_pandas_all(value_filter=f"dataset_id == '{dataset_id}'") + if len(dataset) == 0: + raise KeyError("Unknown dataset_id") + + locator = description["h5ads"].copy() + h5ads_base_uri = locator["uri"] + dataset_h5ad_path = dataset.dataset_h5ad_path.iloc[0] + locator["uri"] = uri_join(h5ads_base_uri, dataset_h5ad_path) + return locator + + +def download_source_h5ad(dataset_id: str, to_path: str, *, census_version: str = "latest") -> None: + """ + Download the source H5AD dataset, for the given dataset_id, to the user-specified + file name. + + Will raise an error if the path already exists (i.e., will not overwrite + an existing file), or is not a file. + + Parameters + ---------- + dataset_id : str + Fetch the source (original) H5AD associated with this dataset_id. + to_path : str + The file name where the downloaded H5AD will be written. Must not already exist. + census_version : str + The census version tag. Defaults to ``latest``. + + Returns + ------- + None + + See Also + -------- + get_source_h5ad_uri : Look up the location of the source H5AD. + + Examples + -------- + >>> download_source_h5ad("8e47ed12-c658-4252-b126-381df8d52a3d", to_path="/tmp/data.h5ad") + + """ + if os.path.exists(to_path): + raise ValueError("Path exists - will not overwrite existing file.") + if to_path.endswith("/"): + raise ValueError("Specify to_path as a file name, not a directory name.") + + locator = get_source_h5ad_uri(dataset_id, census_version=census_version) + protocol = urllib.parse.urlparse(locator["uri"]).scheme + assert protocol == "s3" + + fs = s3fs.S3FileSystem( + anon=True, + cache_regions=True, + ) + fs.get_file(locator["uri"], to_path) diff --git a/api/python/cell_census/src/cell_census/release_directory.py b/api/python/cell_census/src/cell_census/release_directory.py new file mode 100644 index 000000000..a9cea236f --- /dev/null +++ b/api/python/cell_census/src/cell_census/release_directory.py @@ -0,0 +1,67 @@ +from typing import Dict, Optional, TypedDict, Union, cast + +import requests + +""" +The following types describe the expected directory of Cell Census builds, used +to bootstrap all data location requests. +""" +CensusReleaseTag = str # name or version of census, eg, "release-99" or "2022-10-01-test" +CensusLocator = TypedDict( + "CensusLocator", + { + "uri": str, # resource URI + "s3_region": Optional[str], # if an S3 URI, has optional region + }, +) +CensusReleaseDescription = TypedDict( + "CensusReleaseDescription", + { + "release_date": Optional[str], # date of release, optional + "release_build": str, # date of build + "soma": CensusLocator, + "h5ads": CensusLocator, + }, +) +CensusDirectory = Dict[CensusReleaseTag, Union[CensusReleaseTag, CensusReleaseDescription]] + + +# URL for the default top-level directory of all public data, formatted as a CensusDirectory +CELL_CENSUS_RELEASE_DIRECTORY_URL = "https://s3.us-west-2.amazonaws.com/cellxgene-data-public/cell-census/release.json" + + +def get_release_description(tag: str) -> CensusReleaseDescription: + """Get release description for given tag. 
Raises KeyError if unknown tag value."""
+    census_directory = get_directory()
+    description = census_directory.get(tag, None)
+    if description is None:
+        raise KeyError(f"Unable to locate cell census version: {tag}.")
+    return description
+
+
+def get_directory() -> Dict[CensusReleaseTag, CensusReleaseDescription]:
+    """
+    Get the directory of cell census releases available.
+    """
+    response = requests.get(CELL_CENSUS_RELEASE_DIRECTORY_URL)
+    response.raise_for_status()
+    directory: CensusDirectory = cast(CensusDirectory, response.json())
+
+    # Resolve all aliases for easier use
+    for tag in list(directory.keys()):
+        # Strings are aliases for other tags
+        points_at = directory[tag]
+        while isinstance(points_at, str):
+            # resolve aliases
+            if points_at not in directory:
+                # oops, dangling pointer -- drop original tag
+                directory.pop(tag)
+                break
+
+            points_at = directory[points_at]
+
+        if isinstance(points_at, dict):
+            directory[tag] = points_at
+
+    # Cast is safe, as we have removed all tag aliases
+    return cast(Dict[CensusReleaseTag, CensusReleaseDescription], directory)
diff --git a/api/python/cell_census/src/cell_census/util.py b/api/python/cell_census/src/cell_census/util.py
new file mode 100644
index 000000000..360d61eab
--- /dev/null
+++ b/api/python/cell_census/src/cell_census/util.py
@@ -0,0 +1,15 @@
+import urllib.parse
+
+
+def uri_join(base: str, url: str) -> str:
+    """
+    Like urllib.parse.urljoin, but is not confused by the s3:// scheme.
+    """
+    p_url = urllib.parse.urlparse(url)
+    if p_url.netloc:
+        return url
+
+    p_base = urllib.parse.urlparse(base)
+    path = urllib.parse.urljoin(p_base.path, p_url.path)
+    parts = [p_base.scheme, p_base.netloc, path, p_url.params, p_url.query, p_url.fragment]
+    return urllib.parse.urlunparse(parts)
diff --git a/api/python/notebooks/README.md b/api/python/notebooks/README.md
new file mode 100644
index 000000000..ea39208ff
--- /dev/null
+++ b/api/python/notebooks/README.md
@@ -0,0 +1,58 @@
+# ReadMe
+
+Demonstration notebooks for the CELLxGENE Cell Census
+
+This is a quick start on how to run the notebooks. It is Linux-flavored.
+
+## Dependencies
+
+You must be on a Linux or MacOS system, with the following installed:
+* Python 3.9+
+* C++ 17 build tools
+* cmake 3.21 or later
+* git
+* Jupyter or some other means of running notebooks (e.g., vscode)
+
+For now, it is recommended that you do all this on a host with sufficient memory,
+and a high-bandwidth connection to AWS S3 in the us-west-2 region, e.g., an m6i.16xlarge.
+If you utilize AWS, an Ubuntu 20 or 22 AMI is recommended (the stock AWS AMI should
+work fine, but has not been tested).
+
+I also recommend you use a `d` instance type, and mount all of the NVMe drives as swap,
+as it will keep you from running out of RAM.
+
+## Step 1: Clone Repos
+
+On your target host:
+1. Make a new working directory and `cd` into it
+2. Clone both TileDB-SOMA and cell-census:
+```bash
+$ git clone https://github.com/single-cell-data/TileDB-SOMA.git
+$ git clone https://github.com/chanzuckerberg/cell-census.git
+```
+
+## Step 2: Set up Python environment
+1. In your working directory, make and activate a virtual environment.
+```shell
+ $ python -m venv ./venv
+ $ source ./venv/bin/activate
+```
+2. Build and install SOMA into your virtual environment by following the instructions in `TileDB-SOMA/apis/python/README.md`
+3. Install the `cell_census` package:
+```shell
+ $ pip install -e cell-census/api/python/cell_census/
+```
+4.
Install packages needed to run notebooks:
+```shell
+ $ pip install scikit-misc
+```
+
+## Verify your installation
+Check that your installation works - this may take a few seconds, as it loads metadata from S3:
+```shell
+$ python -c 'import cell_census; print(cell_census.open_soma().soma_type)'
+SOMACollection
+```
+
+## Run notebooks
+Run the notebooks, which you can find in the `cell-census/api/python/notebooks` directory.
diff --git a/api/python/notebooks/census_axis_query.ipynb b/api/python/notebooks/census_axis_query.ipynb
new file mode 100644
index 000000000..9730a3fac
--- /dev/null
+++ b/api/python/notebooks/census_axis_query.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Axis Query Example\n",
+    "\n",
+    "_Goal:_ demonstrate basic axis metadata handling using Pandas.\n",
+    "\n",
+    "The CZ Cell Census stores obs (cell) metadata in a SOMA DataFrame, which can be queried and read as a Pandas DataFrame. The Cell Census also has a convenience package which simplifies opening the census.\n",
+    "\n",
+    "A Pandas DataFrame is an in-memory object. Take care that queries are small enough for results to fit in memory.\n",
+    "\n",
+    "## Open the census\n",
+    "\n",
+    "The `cell_census` Python package contains a convenient API to open the latest version of the Cell Census."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cell_census\n",
+    "\n",
+    "census = cell_census.open_soma()\n",
+    "human = census[\"census_data\"][\"homo_sapiens\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summarize Census cell metadata\n",
+    "\n",
+    "Tips:\n",
+    "\n",
+    "- `read_as_pandas()` and `read_as_pandas_all()` return standard Pandas DataFrame objects, allowing the use of the Pandas API.\n",
+    "- Queries will be much faster if you request only the DataFrame columns required for your analysis (e.g., `column_names=[\"cell_type_ontology_term_id\"]`).\n",
+    "- You can also further refine query results by using a `value_filter`.\n",
+    "\n",
+    "### Example 1 - Summarize all cell types\n",
+    "\n",
+    "This example reads the cell metadata (obs) into a Pandas DataFrame, and summarizes it in a variety of ways using the Pandas API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 545 cell types in the Cell Census! The first 10 are: ['CL:1000329', 'CL:0000787', 'CL:0000798', 'CL:0000909', 'CL:0000151', 'CL:1000348', 'CL:0000064', 'CL:0000576', 'CL:0000451', 'CL:0000898']\n",
+      "\n",
+      "The top 10 cell types and their counts are:\n",
+      "CL:0000679    1889047\n",
+      "CL:0000235    1374219\n",
+      "CL:0000624    1286344\n",
+      "CL:0000860    1272977\n",
+      "CL:0000625    1244993\n",
+      "CL:0000623    1031420\n",
+      "CL:0000236     945552\n",
+      "CL:0001064     797557\n",
+      "CL:0000057     741330\n",
+      "CL:0000746     731139\n",
+      "Name: cell_type_ontology_term_id, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read into a pandas dataframe.\n",
+    "obs_df = human.obs.read_as_pandas_all(column_names=[\"cell_type_ontology_term_id\"])\n",
+    "\n",
+    "# Use Pandas API to find all unique values in the `cell_type_ontology_term_id` column.\n",
+    "unique_cell_type_ontology_term_id = obs_df.cell_type_ontology_term_id.unique()\n",
+    "\n",
+    "# Display only the first 10, as there are a LOT!\n",
+    "print(\n",
+    "    f\"There are {len(unique_cell_type_ontology_term_id)} cell types in the Cell Census!
The first 10 are:\",\n",
+    "    unique_cell_type_ontology_term_id[0:10].tolist(),\n",
+    ")\n",
+    "\n",
+    "# Using Pandas API, count the instances of each cell type term and return the top 10.\n",
+    "top_10 = obs_df.cell_type_ontology_term_id.value_counts()[0:10]\n",
+    "print(\"\\nThe top 10 cell types and their counts are:\")\n",
+    "print(top_10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Summarize a subset of cell types, selected with a `value_filter`\n",
+    "\n",
+    "This example utilizes a SOMA \"value filter\" to read the subset of cells with `tissue_ontology_term_id` equal to `UBERON:0002048` (lung tissue), and summarizes the query result using Pandas."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 176 cell types in the Cell Census where tissue_ontology_term_id == UBERON:0002048! The first 10 are: ['CL:0002325', 'CL:0000064', 'CL:0000875', 'CL:0000236', 'CL:0000623', 'CL:0000235', 'CL:0000084', 'CL:0000003', 'CL:0000186', 'CL:0000115']\n",
+      "\n",
+      "Top 10 cell types where tissue_ontology_term_id == UBERON:0002048\n",
+      "CL:0000235    514828\n",
+      "CL:0000583    317503\n",
+      "CL:0000624    265512\n",
+      "CL:0000625    248053\n",
+      "CL:0000003    168203\n",
+      "CL:0000623    164002\n",
+      "CL:0000860    160365\n",
+      "CL:0001064    149067\n",
+      "CL:0002063    142612\n",
+      "CL:0002632    126058\n",
+      "Name: cell_type_ontology_term_id, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Count cell_type occurrences for cells with tissue == 'lung'\n",
+    "\n",
+    "# Read cell_type terms for cells which have a specific tissue term\n",
+    "LUNG_TISSUE = \"UBERON:0002048\"\n",
+    "\n",
+    "obs_df = human.obs.read_as_pandas_all(\n",
+    "    column_names=[\"cell_type_ontology_term_id\"],\n",
+    "    value_filter=f\"tissue_ontology_term_id == '{LUNG_TISSUE}'\",\n",
+    ")\n",
+    "\n",
+    "# Use Pandas API to find all unique values in the `cell_type_ontology_term_id` column.\n",
+    "unique_cell_type_ontology_term_id = obs_df.cell_type_ontology_term_id.unique()\n",
+    "\n",
+    "print(\n",
+    "    f\"There are {len(unique_cell_type_ontology_term_id)} cell types in the Cell Census where tissue_ontology_term_id == {LUNG_TISSUE}! The first 10 are:\",\n",
+    "    unique_cell_type_ontology_term_id[0:10].tolist(),\n",
+    ")\n",
+    "\n",
+    "# Use Pandas API to count, and grab 10 most common\n",
+    "top_10 = obs_df.cell_type_ontology_term_id.value_counts()[0:10]\n",
+    "print(f\"\\nTop 10 cell types where tissue_ontology_term_id == {LUNG_TISSUE}\")\n",
+    "print(top_10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also define much more complex value filters.
For example:\n",
+    "* combine terms with `and` and `or`\n",
+    "* use the `in` operator to query on multiple values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CL:0000746    158188\n",
+       "CL:0008034     84750\n",
+       "CL:0002548     79618\n",
+       "CL:0000115     64114\n",
+       "CL:0002131     61830\n",
+       "CL:0000763     31318\n",
+       "CL:0000669     27104\n",
+       "CL:0000003     22650\n",
+       "CL:0000057     19380\n",
+       "CL:0002144     18050\n",
+       "Name: cell_type_ontology_term_id, dtype: int64"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# You can also do more complex queries, such as testing for inclusion in a list of values\n",
+    "\n",
+    "VENTRICLES = [\"UBERON:0002082\", \"UBERON:0002084\", \"UBERON:0002080\"]\n",
+    "\n",
+    "obs_df = human.obs.read_as_pandas_all(\n",
+    "    column_names=[\"cell_type_ontology_term_id\"],\n",
+    "    value_filter=f\"tissue_ontology_term_id in {VENTRICLES}\",\n",
+    ")\n",
+    "\n",
+    "# Use Pandas API to summarize\n",
+    "top_10 = obs_df.cell_type_ontology_term_id.value_counts()[0:10]\n",
+    "display(top_10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Full census stats\n",
+    "\n",
+    "This example queries all organisms in the Census, and summarizes the diversity of various metadata labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Complete cell census contains 37525571 cells.\n",
+      "homo_sapiens\n",
+      "\tUnique cell_type_ontology_term_id values: 545\n",
+      "\tUnique assay_ontology_term_id values: 21\n",
+      "\tUnique tissue_ontology_term_id values: 178\n",
+      "mus_musculus\n",
+      "\tUnique cell_type_ontology_term_id values: 206\n",
+      "\tUnique assay_ontology_term_id values: 8\n",
+      "\tUnique tissue_ontology_term_id values: 40\n"
+     ]
+    }
+   ],
+   "source": [
+    "COLS_TO_QUERY = [\n",
+    "    \"cell_type_ontology_term_id\",\n",
+    "    \"assay_ontology_term_id\",\n",
+    "    \"tissue_ontology_term_id\",\n",
+    "]\n",
+    "\n",
+    "obs_df = {\n",
+    "    name: experiment.obs.read_as_pandas_all(column_names=COLS_TO_QUERY)\n",
+    "    for name, experiment in census[\"census_data\"].items()\n",
+    "}\n",
+    "\n",
+    "# Use Pandas API to summarize each organism\n",
+    "print(f\"Complete cell census contains {sum(len(df) for df in obs_df.values())} cells.\")\n",
+    "for organism, df in obs_df.items():\n",
+    "    print(organism)\n",
+    "    for col in COLS_TO_QUERY:\n",
+    "        print(f\"\\tUnique {col} values: {len(df[col].unique())}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.5 ('venv': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/api/python/notebooks/census_compute_over_X.ipynb b/api/python/notebooks/census_compute_over_X.ipynb
new file mode 100644
index 000000000..c9372c129
--- /dev/null
+++ b/api/python/notebooks/census_compute_over_X.ipynb
@@ -0,0 +1,764 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Computing on X: using online algorithms\n",
+    "\n",
+
"*Goal:* demonstrate larger-than-core computation on the X matrix, using \"online\" algorithms to process data incrementally.\n", + "\n", + "This notebook computes a variety of per-gene and per-cell statistics for a user-defined query.\n", + "\n", + "*NOTE*: when query results are small, it may be easier to use the SOMAExperment Query class to extract an AnnData, and then just compute over that. This notebook is showing means of incrementally processing larger-than-core (RAM) data, where incremental (online) algorithms are used.\n", + "\n", + "\n", + "First, open up part of the census." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import cell_census\n", + "from cell_census.experiment_query import experiment_query, AxisQuery\n", + "\n", + "census = cell_census.open_soma()\n", + "mouse = census[\"census_data\"][\"mus_musculus\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple incremental (aka \"online\") calculations\n", + "\n", + "Many statistics, such as `mean`, are easy to calculate incrementally. This cell demonstrates a query on the `X['raw']` sparse nD array, which will return results in batches. Accumulate the sum and count incrementally, into `raw_sum` and `raw_n`, and then compute mean.\n", + "\n", + "First define a query - in this case a slice over the obs axis for cells with a specific tissue & sex value, and all genes on the var axis. The `query.X()` method returns an iterator of results, each as a PyArrow Table. Each table will contain the sparse X data and obs/var coordinates, using standad SOMA names:\n", + "* `soma_data` - the X value (float32)\n", + "* `soma_dim_0` - the obs coordinate (int64)\n", + "* `soma_dim_1` - the var coordinate (int64)\n", + "\n", + "**Important**: the X matrices are joined to var/obs axis DataFrames by an integer join \"id\" (aka `soma_joinid`). They are *NOT* positionally indexed, and any given cell or gene may have a `soma_joinid` of any value (e.g., a large integer). In other words, for any given `X` value, the `soma_dim_0` corresponds to the `soma_joinid` in the `obs` dataframe, and the `soma_dim_` coordinate corresponds to the `soma_joinid` in the `var` dataframe.\n", + "\n", + "For convenience, the query package contains a utility function to simplify operations on query slices. `query.get_indexer()` returns an indexer that can be used to wrap the output of `query.X()`, converting from `soma_joinids` to positional indexing. Positions are `[0, N)`, where `N` are the number of results on the query for any given axis (equivalent to the Pandas `.iloc` of the axis dataframe).\n", + "\n", + "Key points:\n", + "* it is expensive to query and read the results - so rather than make multiple passes over the data, read it once and perform multiple computations.\n", + "* by default, data in the census is indexed by `soma_joinid` and not positionally. Use `query.get_indexer()` if you want positions." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_idfeature_namefeature_lengthraw_nraw_mean
soma_joinid
0ENSMUSG000001096440610005C13Rik35831780.505840
1ENSMUSG000000077770610009B22Rik998463043.585939
2ENSMUSG000000867140610009E02Rik180300.000000
3ENSMUSG000000436440610009L18Rik61913707.246176
4ENSMUSG000000208310610010K14Rik1896922495.930902
..................
52368ENSMUSG00000109857Gm53058284600.000000
52369ENSMUSG000001185781700014B07Rik81800.000000
52370ENSMUSG00000118550Gm52965152400.000000
52371ENSMUSG00000117608Gm53018245500.000000
52372ENSMUSG00000118094Gm52988460400.000000
\n", + "

52373 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " feature_id feature_name feature_length raw_n \\\n", + "soma_joinid \n", + "0 ENSMUSG00000109644 0610005C13Rik 3583 178 \n", + "1 ENSMUSG00000007777 0610009B22Rik 998 4630 \n", + "2 ENSMUSG00000086714 0610009E02Rik 1803 0 \n", + "3 ENSMUSG00000043644 0610009L18Rik 619 1370 \n", + "4 ENSMUSG00000020831 0610010K14Rik 1896 9224 \n", + "... ... ... ... ... \n", + "52368 ENSMUSG00000109857 Gm53058 2846 0 \n", + "52369 ENSMUSG00000118578 1700014B07Rik 818 0 \n", + "52370 ENSMUSG00000118550 Gm52965 1524 0 \n", + "52371 ENSMUSG00000117608 Gm53018 2455 0 \n", + "52372 ENSMUSG00000118094 Gm52988 4604 0 \n", + "\n", + " raw_mean \n", + "soma_joinid \n", + "0 0.505840 \n", + "1 43.585939 \n", + "2 0.000000 \n", + "3 7.246176 \n", + "4 95.930902 \n", + "... ... \n", + "52368 0.000000 \n", + "52369 0.000000 \n", + "52370 0.000000 \n", + "52371 0.000000 \n", + "52372 0.000000 \n", + "\n", + "[52373 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with experiment_query(\n", + " mouse, measurement_name=\"RNA\", obs_query=AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\")\n", + ") as query:\n", + " var_df = query.var().to_pandas().set_index(\"soma_joinid\")\n", + " n_vars = len(var_df)\n", + "\n", + " raw_n = np.zeros((n_vars,), dtype=np.int64) # accumulate number of non-zero X values\n", + " raw_sum = np.zeros((n_vars,), dtype=np.float64) # accumulate the sum of expression\n", + "\n", + " # query.X() returns an iterator of pyarrow.Table, with X data in COO format.\n", + " # You can request an indexer from the query that will map it to positional indices\n", + " indexer = query.get_indexer()\n", + " for arrow_tbl in query.X(\"raw\"):\n", + " var_dim = indexer.var_index(arrow_tbl[\"soma_dim_1\"])\n", + " data = arrow_tbl[\"soma_data\"]\n", + " np.add.at(raw_n, var_dim, 1)\n", + " np.add.at(raw_sum, var_dim, data)\n", + "\n", + "with np.errstate(divide=\"ignore\", invalid=\"ignore\"):\n", + " raw_mean = raw_sum / query.n_obs\n", + "raw_mean[np.isnan(raw_mean)] = 0\n", + "\n", + "var_df = var_df.assign(raw_n=pd.Series(data=raw_n, index=var_df.index))\n", + "var_df = var_df.assign(raw_mean=pd.Series(data=raw_mean, index=var_df.index))\n", + "\n", + "display(var_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex online calculations\n", + "\n", + "Other statistics are not as simple when implemented as an online algorithm. This cell demonstrates an implementation of an online computation of `variance`, using [Welford's online calculation of mean and variance](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm).\n", + "\n", + "This code is also available in the `cell_census.compute` module." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_idfeature_namefeature_lengthraw_meanraw_variance
soma_joinid
0ENSMUSG000001096440610005C13Rik35830.505840440.270924
1ENSMUSG000000077770610009B22Rik99843.585939150808.450302
2ENSMUSG000000867140610009E02Rik18030.0000000.000000
3ENSMUSG000000436440610009L18Rik6197.2461768274.477431
4ENSMUSG000000208310610010K14Rik189695.930902296255.749040
..................
52368ENSMUSG00000109857Gm5305828460.0000000.000000
52369ENSMUSG000001185781700014B07Rik8180.0000000.000000
52370ENSMUSG00000118550Gm5296515240.0000000.000000
52371ENSMUSG00000117608Gm5301824550.0000000.000000
52372ENSMUSG00000118094Gm5298846040.0000000.000000
\n", + "

52373 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " feature_id feature_name feature_length raw_mean \\\n", + "soma_joinid \n", + "0 ENSMUSG00000109644 0610005C13Rik 3583 0.505840 \n", + "1 ENSMUSG00000007777 0610009B22Rik 998 43.585939 \n", + "2 ENSMUSG00000086714 0610009E02Rik 1803 0.000000 \n", + "3 ENSMUSG00000043644 0610009L18Rik 619 7.246176 \n", + "4 ENSMUSG00000020831 0610010K14Rik 1896 95.930902 \n", + "... ... ... ... ... \n", + "52368 ENSMUSG00000109857 Gm53058 2846 0.000000 \n", + "52369 ENSMUSG00000118578 1700014B07Rik 818 0.000000 \n", + "52370 ENSMUSG00000118550 Gm52965 1524 0.000000 \n", + "52371 ENSMUSG00000117608 Gm53018 2455 0.000000 \n", + "52372 ENSMUSG00000118094 Gm52988 4604 0.000000 \n", + "\n", + " raw_variance \n", + "soma_joinid \n", + "0 440.270924 \n", + "1 150808.450302 \n", + "2 0.000000 \n", + "3 8274.477431 \n", + "4 296255.749040 \n", + "... ... \n", + "52368 0.000000 \n", + "52369 0.000000 \n", + "52370 0.000000 \n", + "52371 0.000000 \n", + "52372 0.000000 \n", + "\n", + "[52373 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numba\n", + "import numpy.typing as npt\n", + "\n", + "\n", + "class OnlineMatrixMeanVariance:\n", + " n_samples: int\n", + " n_variables: int\n", + "\n", + " def __init__(self, n_samples: int, n_variables: int):\n", + " \"\"\"\n", + " Compute mean and variance for n_variables over n_samples, encoded\n", + " in a COO format. Equivalent to:\n", + " numpy.mean(data, axis=0)\n", + " numpy.var(data, axix=0)\n", + " where the input `data` is of shape (n_samples, n_variables)\n", + " \"\"\"\n", + " self.n_samples = n_samples\n", + " self.n_variables = n_variables\n", + "\n", + " self.n_a = np.zeros((n_variables,), dtype=np.int32)\n", + " self.u_a = np.zeros((n_variables,), dtype=np.float64)\n", + " self.M2_a = np.zeros((n_variables,), dtype=np.float64)\n", + "\n", + " def update(self, coord_vec: npt.NDArray[np.int64], value_vec: npt.NDArray[np.float32]) -> None:\n", + " _mean_variance_update(coord_vec, value_vec, self.n_a, self.u_a, self.M2_a)\n", + "\n", + " def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:\n", + " \"\"\"\n", + " Returns tuple containing mean and variance\n", + " \"\"\"\n", + " u, M2 = _mean_variance_finalize(self.n_samples, self.n_a, self.u_a, self.M2_a)\n", + "\n", + " # compute sample variance\n", + " var = M2 / max(1, (self.n_samples - 1))\n", + "\n", + " return u, var\n", + "\n", + "\n", + "@numba.jit(nopython=True)\n", + "def _mean_variance_update(\n", + " col_arr: npt.NDArray[np.int64],\n", + " val_arr: npt.NDArray[np.float32],\n", + " n: npt.NDArray[np.int32],\n", + " u: npt.NDArray[np.float64],\n", + " M2: npt.NDArray[np.float64],\n", + "):\n", + " \"\"\"\n", + " Incrementally accumulate mean and sum of square of distance from mean using\n", + " Welford's online method.\n", + " \"\"\"\n", + " for col, val in zip(col_arr, val_arr):\n", + " u_prev = u[col]\n", + " M2_prev = M2[col]\n", + " n[col] += 1\n", + " u[col] = u_prev + (val - u_prev) / n[col]\n", + " M2[col] = M2_prev + (val - u_prev) * (val - u[col])\n", + "\n", + "\n", + "@numba.jit(nopython=True)\n", + "def _mean_variance_finalize(\n", + " n_samples: int, n_a: npt.NDArray[np.int32], u_a: npt.NDArray[np.float64], M2_a: npt.NDArray[np.float64]\n", + "):\n", + " \"\"\"\n", + " Finalize incremental values, acconting for missing elements (due to sparse input).\n", + " Non-sparse and sparse combined using Chan's parallel adaptation of Welford's.\n", + " The code assumes the sparse elements 
are all zero and ignores those terms.\n", + " \"\"\"\n", + " n_b = n_samples - n_a\n", + " delta = -u_a # assumes u_b == 0\n", + " u = (n_a * u_a) / n_samples\n", + " M2 = M2_a + delta**2 * n_a * n_b / n_samples # assumes M2_b == 0\n", + " return u, M2\n", + "\n", + "\n", + "with experiment_query(\n", + " mouse, measurement_name=\"RNA\", obs_query=AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\")\n", + ") as query:\n", + " var_df = query.var().to_pandas().set_index(\"soma_joinid\")\n", + " n_vars = len(var_df)\n", + "\n", + " indexer = query.get_indexer()\n", + " mvn = OnlineMatrixMeanVariance(query.n_obs, n_vars)\n", + " for arrow_tbl in query.X(\"raw\"):\n", + " var_dim = indexer.var_index(arrow_tbl[\"soma_dim_1\"])\n", + " data = arrow_tbl[\"soma_data\"].to_numpy()\n", + " mvn.update(var_dim, data)\n", + "\n", + " u, v = mvn.finalize()\n", + "\n", + "var_df = var_df.assign(raw_mean=pd.Series(data=u, index=var_df.index))\n", + "var_df = var_df.assign(raw_variance=pd.Series(data=v, index=var_df.index))\n", + "\n", + "display(var_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A more complex example - counting cells per feature, grouped by dataset_id\n", + "\n", + "This example demonstrates a more complex example where the goal is to count the number of cells per gene, grouped by cell dataset_id. The result is a Pandas DataFrame indexed by `obs.dataset_id` and `var.feature_id`, containing the number of cells per pair.\n", + "\n", + "This example does not use positional indexing, but rather demonstrates the use of Pandas DataFrame `join` to join on the `soma_joinid`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
n_cellsfeature_name
dataset_idfeature_id
66ff82b4-9380-469c-bc4b-cfa08eacd325ENSMUSG00000109644500610005C13Rik
98e5ea9f-16d6-47ec-a529-686e76515e39ENSMUSG000001096441460610005C13Rik
c08f8441-4a10-4748-872a-e70c0bcccdbaENSMUSG00000109644960610005C13Rik
66ff82b4-9380-469c-bc4b-cfa08eacd325ENSMUSG0000000777716490610009B22Rik
98e5ea9f-16d6-47ec-a529-686e76515e39ENSMUSG0000000777737420610009B22Rik
............
66ff82b4-9380-469c-bc4b-cfa08eacd325ENSMUSG000001173102690Ptp4a1_ENSMUSG00000117310
98e5ea9f-16d6-47ec-a529-686e76515e39ENSMUSG000001173103759Ptp4a1_ENSMUSG00000117310
c08f8441-4a10-4748-872a-e70c0bcccdbaENSMUSG000001173101069Ptp4a1_ENSMUSG00000117310
66ff82b4-9380-469c-bc4b-cfa08eacd325ENSMUSG000000880251Rprl3
98e5ea9f-16d6-47ec-a529-686e76515e39ENSMUSG000000880251Rprl3
\n", + "

61938 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " n_cells \\\n", + "dataset_id feature_id \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000109644 50 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000109644 146 \n", + "c08f8441-4a10-4748-872a-e70c0bcccdba ENSMUSG00000109644 96 \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000007777 1649 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000007777 3742 \n", + "... ... \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000117310 2690 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000117310 3759 \n", + "c08f8441-4a10-4748-872a-e70c0bcccdba ENSMUSG00000117310 1069 \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000088025 1 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000088025 1 \n", + "\n", + " feature_name \n", + "dataset_id feature_id \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000109644 0610005C13Rik \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000109644 0610005C13Rik \n", + "c08f8441-4a10-4748-872a-e70c0bcccdba ENSMUSG00000109644 0610005C13Rik \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000007777 0610009B22Rik \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000007777 0610009B22Rik \n", + "... ... \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000117310 Ptp4a1_ENSMUSG00000117310 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000117310 Ptp4a1_ENSMUSG00000117310 \n", + "c08f8441-4a10-4748-872a-e70c0bcccdba ENSMUSG00000117310 Ptp4a1_ENSMUSG00000117310 \n", + "66ff82b4-9380-469c-bc4b-cfa08eacd325 ENSMUSG00000088025 Rprl3 \n", + "98e5ea9f-16d6-47ec-a529-686e76515e39 ENSMUSG00000088025 Rprl3 \n", + "\n", + "[61938 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from cell_census.experiment_query import X_as_series\n", + "\n", + "with experiment_query(\n", + " mouse,\n", + " measurement_name=\"RNA\",\n", + " obs_query=AxisQuery(value_filter=\"tissue=='brain'\"),\n", + ") as query:\n", + " obs_df = query.obs(column_names=[\"soma_joinid\", \"dataset_id\"]).to_pandas().set_index(\"soma_joinid\")\n", + " var_df = query.var().to_pandas().set_index(\"soma_joinid\")\n", + " n_cells_by_dataset = pd.Series(\n", + " 0,\n", + " index=pd.MultiIndex.from_product(\n", + " (var_df.index, obs_df.dataset_id.unique()), names=[\"soma_joinid\", \"dataset_id\"]\n", + " ),\n", + " dtype=np.int64,\n", + " name=\"n_cells\",\n", + " )\n", + "\n", + " for X_tbl in query.X(\"raw\"):\n", + " # Group by dataset_id and count unique (genes, dataset_id)\n", + " value_counts = (\n", + " X_as_series(X_tbl)\n", + " .to_frame()\n", + " .join(obs_df[[\"dataset_id\"]], on=\"soma_dim_0\")\n", + " .reset_index(level=1)\n", + " .drop(columns=[\"soma_data\"])\n", + " .value_counts()\n", + " )\n", + " np.add.at(n_cells_by_dataset, n_cells_by_dataset.index.get_indexer(value_counts.index), value_counts.to_numpy())\n", + "\n", + "# drop any combinations that are not observed\n", + "n_cells_by_dataset = n_cells_by_dataset[n_cells_by_dataset > 0]\n", + "\n", + "# and join with var_df to pick up feature_id and feature_name\n", + "n_cells_by_dataset = (\n", + " n_cells_by_dataset.to_frame()\n", + " .reset_index(level=1)\n", + " .join(var_df[[\"feature_id\", \"feature_name\"]])\n", + " .set_index([\"dataset_id\", \"feature_id\"])\n", + ")\n", + "\n", + "display(n_cells_by_dataset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": 
{ + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/api/python/notebooks/census_dataset_presence.ipynb b/api/python/notebooks/census_dataset_presence.ipynb new file mode 100644 index 000000000..252138ab5 --- /dev/null +++ b/api/python/notebooks/census_dataset_presence.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Census datasets presence\n", + "\n", + "*Goal:* demonstrate basic use of the `datasets_presence_matrix` array.\n", + "\n", + "The presence matrix is a sparse array, indicating which features (var) were present in each dataset. The array has dimensions [n_datasets, n_var], and is stored in the SOMA Measurement `varp` collection. The first dimension is indexed by the `soma_joinid` in the `census_datasets` dataframe. The second is indexed by the `soma_joinid` in the `var` dataframe of the measurement." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
soma_joinidcollection_idcollection_namecollection_doidataset_iddataset_titledataset_h5ad_pathdataset_total_cell_count
0003f821b4-87be-4ff4-b65a-b5fc00061da7Local and systemic responses to SARS-CoV-2 inf...10.1038/s41586-021-04345-xedc8d3fe-153c-4e3d-8be0-2108d30f8d70Airwayedc8d3fe-153c-4e3d-8be0-2108d30f8d70.h5ad236977
1103f821b4-87be-4ff4-b65a-b5fc00061da7Local and systemic responses to SARS-CoV-2 inf...10.1038/s41586-021-04345-x2a498ace-872a-4935-984b-1afa70fd9886PBMC2a498ace-872a-4935-984b-1afa70fd9886.h5ad422220
2243d4bb39-21af-4d05-b973-4c1fed7b916cTranscriptional Programming of Normal and Infl...10.1016/j.celrep.2018.09.006f512b8b6-369d-4a85-a695-116e0806857fSkinf512b8b6-369d-4a85-a695-116e0806857f.h5ad68036
330434a9d4-85fd-4554-b8e3-cf6c582bb2faAcute COVID-19 cohort across a range of WHO ca...10.1101/2020.11.20.20227355fa8605cf-f27e-44af-ac2a-476bee4410d3PBMCsfa8605cf-f27e-44af-ac2a-476bee4410d3.h5ad59506
443472f32d-4a33-48e2-aad5-666d4631bf4cA single-cell transcriptome atlas of the adult...10.15252/embj.2018100811d5c67a4e-a8d9-456d-a273-fa01adb1b308Retinad5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad19694
...........................
347347f70ebd97-b3bc-44fe-849d-c18e08fe773dA transcriptomic atlas of the mouse cerebellum...10.1101/2020.03.04.976407e0ed3c55-aff6-4bb7-b6ff-98a2d90b890cA transcriptomic atlas of the mouse cerebellume0ed3c55-aff6-4bb7-b6ff-98a2d90b890c.h5ad611034
3483485d445965-6f1a-4b68-ba3a-b8f765155d3aA molecular cell atlas of the human lung from ...10.1038/s41586-020-2922-4e04daea4-4412-45b5-989e-76a9be070a89Krasnow Lab Human Lung Cell Atlas, Smart-seq2e04daea4-4412-45b5-989e-76a9be070a89.h5ad9409
3493495d445965-6f1a-4b68-ba3a-b8f765155d3aA molecular cell atlas of the human lung from ...10.1038/s41586-020-2922-48c42cfd0-0b0a-46d5-910c-fc833d83c45eKrasnow Lab Human Lung Cell Atlas, 10X8c42cfd0-0b0a-46d5-910c-fc833d83c45e.h5ad65662
35035017481d16-ee44-49e5-bcf0-28c0780d8c4aSingle-Cell Sequencing of Developing Human Gut...10.1016/j.devcel.2020.11.0108e47ed12-c658-4252-b126-381df8d52a3dPaediatric Human Gut (4-14y)8e47ed12-c658-4252-b126-381df8d52a3d.h5ad22502
35135117481d16-ee44-49e5-bcf0-28c0780d8c4aSingle-Cell Sequencing of Developing Human Gut...10.1016/j.devcel.2020.11.010b46237d1-19c6-4af2-9335-9854634bad16Fetal Human Gut (6-11 PCW)b46237d1-19c6-4af2-9335-9854634bad16.h5ad62849
\n", + "

352 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " soma_joinid collection_id \\\n", + "0 0 03f821b4-87be-4ff4-b65a-b5fc00061da7 \n", + "1 1 03f821b4-87be-4ff4-b65a-b5fc00061da7 \n", + "2 2 43d4bb39-21af-4d05-b973-4c1fed7b916c \n", + "3 3 0434a9d4-85fd-4554-b8e3-cf6c582bb2fa \n", + "4 4 3472f32d-4a33-48e2-aad5-666d4631bf4c \n", + ".. ... ... \n", + "347 347 f70ebd97-b3bc-44fe-849d-c18e08fe773d \n", + "348 348 5d445965-6f1a-4b68-ba3a-b8f765155d3a \n", + "349 349 5d445965-6f1a-4b68-ba3a-b8f765155d3a \n", + "350 350 17481d16-ee44-49e5-bcf0-28c0780d8c4a \n", + "351 351 17481d16-ee44-49e5-bcf0-28c0780d8c4a \n", + "\n", + " collection_name \\\n", + "0 Local and systemic responses to SARS-CoV-2 inf... \n", + "1 Local and systemic responses to SARS-CoV-2 inf... \n", + "2 Transcriptional Programming of Normal and Infl... \n", + "3 Acute COVID-19 cohort across a range of WHO ca... \n", + "4 A single-cell transcriptome atlas of the adult... \n", + ".. ... \n", + "347 A transcriptomic atlas of the mouse cerebellum... \n", + "348 A molecular cell atlas of the human lung from ... \n", + "349 A molecular cell atlas of the human lung from ... \n", + "350 Single-Cell Sequencing of Developing Human Gut... \n", + "351 Single-Cell Sequencing of Developing Human Gut... \n", + "\n", + " collection_doi dataset_id \\\n", + "0 10.1038/s41586-021-04345-x edc8d3fe-153c-4e3d-8be0-2108d30f8d70 \n", + "1 10.1038/s41586-021-04345-x 2a498ace-872a-4935-984b-1afa70fd9886 \n", + "2 10.1016/j.celrep.2018.09.006 f512b8b6-369d-4a85-a695-116e0806857f \n", + "3 10.1101/2020.11.20.20227355 fa8605cf-f27e-44af-ac2a-476bee4410d3 \n", + "4 10.15252/embj.2018100811 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + ".. ... ... \n", + "347 10.1101/2020.03.04.976407 e0ed3c55-aff6-4bb7-b6ff-98a2d90b890c \n", + "348 10.1038/s41586-020-2922-4 e04daea4-4412-45b5-989e-76a9be070a89 \n", + "349 10.1038/s41586-020-2922-4 8c42cfd0-0b0a-46d5-910c-fc833d83c45e \n", + "350 10.1016/j.devcel.2020.11.010 8e47ed12-c658-4252-b126-381df8d52a3d \n", + "351 10.1016/j.devcel.2020.11.010 b46237d1-19c6-4af2-9335-9854634bad16 \n", + "\n", + " dataset_title \\\n", + "0 Airway \n", + "1 PBMC \n", + "2 Skin \n", + "3 PBMCs \n", + "4 Retina \n", + ".. ... \n", + "347 A transcriptomic atlas of the mouse cerebellum \n", + "348 Krasnow Lab Human Lung Cell Atlas, Smart-seq2 \n", + "349 Krasnow Lab Human Lung Cell Atlas, 10X \n", + "350 Paediatric Human Gut (4-14y) \n", + "351 Fetal Human Gut (6-11 PCW) \n", + "\n", + " dataset_h5ad_path dataset_total_cell_count \n", + "0 edc8d3fe-153c-4e3d-8be0-2108d30f8d70.h5ad 236977 \n", + "1 2a498ace-872a-4935-984b-1afa70fd9886.h5ad 422220 \n", + "2 f512b8b6-369d-4a85-a695-116e0806857f.h5ad 68036 \n", + "3 fa8605cf-f27e-44af-ac2a-476bee4410d3.h5ad 59506 \n", + "4 d5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad 19694 \n", + ".. ... ... 
\n", + "347 e0ed3c55-aff6-4bb7-b6ff-98a2d90b890c.h5ad 611034 \n", + "348 e04daea4-4412-45b5-989e-76a9be070a89.h5ad 9409 \n", + "349 8c42cfd0-0b0a-46d5-910c-fc833d83c45e.h5ad 65662 \n", + "350 8e47ed12-c658-4252-b126-381df8d52a3d.h5ad 22502 \n", + "351 b46237d1-19c6-4af2-9335-9854634bad16.h5ad 62849 \n", + "\n", + "[352 rows x 8 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "from scipy import sparse\n", + "import cell_census\n", + "\n", + "census = cell_census.open_soma()\n", + "\n", + "# Grab the experiment containing human data, and the measurement therein with RNA\n", + "human = census[\"census_data\"][\"homo_sapiens\"]\n", + "human_rna = human.ms[\"RNA\"]\n", + "\n", + "# The cell census-wide datasets\n", + "datasets_df = census[\"census_info\"][\"datasets\"].read_as_pandas_all()\n", + "display(datasets_df)\n", + "\n", + "# The human RNA presence matrix\n", + "presence = human.ms[\"RNA\"].varp[\"dataset_presence_matrix\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For convenience, read the entire presence matrix (for Homo sapiens) into a SciPy array:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<352x60564 sparse matrix of type ''\n", + "\twith 7220633 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the entire presence matrix. It may be returned in incremental chunks if larger than\n", + "# read buffers, so concatenate into a single scipy.sparse.sp_matrix.\n", + "\n", + "# TODO: TileDB-Py#501 when implemented, will simplify this\n", + "\n", + "arrow_sparse_tensors = [t for t in presence.read_sparse_tensor((slice(None),))]\n", + "flat_arrays = [t.to_numpy() for t in arrow_sparse_tensors]\n", + "data = np.concatenate(tuple(t[0] for t in flat_arrays))\n", + "coords = np.concatenate(tuple(t[1] for t in flat_arrays))\n", + "presence_matrix = sparse.coo_matrix(\n", + " (data.flatten(), (coords.T[0].flatten(), coords.T[1].flatten())), shape=presence.shape\n", + ").tocsr()\n", + "presence_matrix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also need the `var` dataframe, which is read into a Pandas DataFrame for convenient manipulation:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "var_df = human_rna.var.read_as_pandas_all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Is a feature present in a dataset?\n", + "\n", + "*Goal:* test if a given feature is present in a given dataset.\n", + "\n", + "**Important:** the presence matrix is indexed by soma_joinid, and is *NOT* positionally indexed. In other words:\n", + "* the first dimension of the presence matrix is the dataset's `soma_joinid`, as stored in the `census_datasets` dataframe.\n", + "* the second dimension of the presence matrix is the feature's `soma_joinid`, as stored in the `var` dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature is present.\n" + ] + } + ], + "source": [ + "var_joinid = var_df.loc[var_df.feature_id == \"ENSG00000286096\"].soma_joinid\n", + "dataset_joinid = datasets_df.loc[datasets_df.dataset_id == \"97a17473-e2b1-4f31-a544-44a60773e2dd\"].soma_joinid\n", + "is_present = presence_matrix[dataset_joinid, var_joinid][0, 0]\n", + "print(f'Feature is {\"present\" if is_present else \"not present\"}.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What datasets contain a feature?\n", + "\n", + "*Goal:* look up all datasets that have a feature_id present." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
soma_joinidcollection_idcollection_namecollection_doidataset_iddataset_titledataset_h5ad_pathdataset_total_cell_count
304304e5f58829-1a66-40b5-a624-9046778e74f5Tabula Sapiens10.1126/science.abl4896a68b64d8-aee3-4947-81b7-36b8fe5a44d2Tabula Sapiens - Stromala68b64d8-aee3-4947-81b7-36b8fe5a44d2.h5ad82478
305305e5f58829-1a66-40b5-a624-9046778e74f5Tabula Sapiens10.1126/science.abl489697a17473-e2b1-4f31-a544-44a60773e2ddTabula Sapiens - Epithelial97a17473-e2b1-4f31-a544-44a60773e2dd.h5ad104148
306306e5f58829-1a66-40b5-a624-9046778e74f5Tabula Sapiens10.1126/science.abl4896c5d88abe-f23a-45fa-a534-788985e93dadTabula Sapiens - Immunec5d88abe-f23a-45fa-a534-788985e93dad.h5ad264824
307307e5f58829-1a66-40b5-a624-9046778e74f5Tabula Sapiens10.1126/science.abl48965a11f879-d1ef-458a-910c-9b0bdfca5ebfTabula Sapiens - Endothelial5a11f879-d1ef-458a-910c-9b0bdfca5ebf.h5ad31691
308308e5f58829-1a66-40b5-a624-9046778e74f5Tabula Sapiens10.1126/science.abl489653d208b0-2cfd-4366-9866-c3c6114081bcTabula Sapiens - All Cells53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad483152
\n", + "
" + ], + "text/plain": [ + " soma_joinid collection_id collection_name \\\n", + "304 304 e5f58829-1a66-40b5-a624-9046778e74f5 Tabula Sapiens \n", + "305 305 e5f58829-1a66-40b5-a624-9046778e74f5 Tabula Sapiens \n", + "306 306 e5f58829-1a66-40b5-a624-9046778e74f5 Tabula Sapiens \n", + "307 307 e5f58829-1a66-40b5-a624-9046778e74f5 Tabula Sapiens \n", + "308 308 e5f58829-1a66-40b5-a624-9046778e74f5 Tabula Sapiens \n", + "\n", + " collection_doi dataset_id \\\n", + "304 10.1126/science.abl4896 a68b64d8-aee3-4947-81b7-36b8fe5a44d2 \n", + "305 10.1126/science.abl4896 97a17473-e2b1-4f31-a544-44a60773e2dd \n", + "306 10.1126/science.abl4896 c5d88abe-f23a-45fa-a534-788985e93dad \n", + "307 10.1126/science.abl4896 5a11f879-d1ef-458a-910c-9b0bdfca5ebf \n", + "308 10.1126/science.abl4896 53d208b0-2cfd-4366-9866-c3c6114081bc \n", + "\n", + " dataset_title dataset_h5ad_path \\\n", + "304 Tabula Sapiens - Stromal a68b64d8-aee3-4947-81b7-36b8fe5a44d2.h5ad \n", + "305 Tabula Sapiens - Epithelial 97a17473-e2b1-4f31-a544-44a60773e2dd.h5ad \n", + "306 Tabula Sapiens - Immune c5d88abe-f23a-45fa-a534-788985e93dad.h5ad \n", + "307 Tabula Sapiens - Endothelial 5a11f879-d1ef-458a-910c-9b0bdfca5ebf.h5ad \n", + "308 Tabula Sapiens - All Cells 53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad \n", + "\n", + " dataset_total_cell_count \n", + "304 82478 \n", + "305 104148 \n", + "306 264824 \n", + "307 31691 \n", + "308 483152 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Grab the feature's soma_joinid from the var dataframe\n", + "var_joinid = var_df.loc[var_df.feature_id == \"ENSG00000286096\"].soma_joinid\n", + "\n", + "# The presence matrix is indexed by the joinids of the dataset and var dataframes,\n", + "# so slice out the feature of interest by its joinid.\n", + "dataset_joinids = presence_matrix[:, var_joinid].tocoo().row\n", + "\n", + "# From the datasets dataframe, slice out the datasets which have a joinid in the list\n", + "datasets_df.loc[datasets_df.soma_joinid.isin(dataset_joinids)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What features are in a dataset?\n", + "\n", + "*Goal:* lookup the features present in a given dataset.\n", + "\n", + "This example also demonstrates the ability to do the query on multiple datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
soma_joinidfeature_idfeature_namefeature_length
00ENSG00000121410A1BG3999
11ENSG00000268895A1BG-AS13374
22ENSG00000148584A1CF9603
33ENSG00000175899A2M6318
44ENSG00000245105A2M-AS12948
...............
4464444644ENSG00000219926OR7E104P4672
4464844648ENSG00000267104TBC1D3P1-DHX40P11841
4464944649ENSG00000265766CXADRP31955
4465144651ENSG00000267453CLEC4O774
4465444654ENSG00000279274RP11-533E23.275
\n", + "

27211 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " soma_joinid feature_id feature_name feature_length\n", + "0 0 ENSG00000121410 A1BG 3999\n", + "1 1 ENSG00000268895 A1BG-AS1 3374\n", + "2 2 ENSG00000148584 A1CF 9603\n", + "3 3 ENSG00000175899 A2M 6318\n", + "4 4 ENSG00000245105 A2M-AS1 2948\n", + "... ... ... ... ...\n", + "44644 44644 ENSG00000219926 OR7E104P 4672\n", + "44648 44648 ENSG00000267104 TBC1D3P1-DHX40P1 1841\n", + "44649 44649 ENSG00000265766 CXADRP3 1955\n", + "44651 44651 ENSG00000267453 CLEC4O 774\n", + "44654 44654 ENSG00000279274 RP11-533E23.2 75\n", + "\n", + "[27211 rows x 4 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Slice the dataset(s) of interest, and get the joinid(s)\n", + "dataset_joinids = datasets_df.loc[datasets_df.collection_id == \"17481d16-ee44-49e5-bcf0-28c0780d8c4a\"].soma_joinid\n", + "\n", + "# Slice the presence matrix by the first dimension, i.e., by dataset\n", + "var_joinids = presence_matrix[dataset_joinids, :].tocoo().col\n", + "\n", + "# From the feature (var) dataframe, slice out features which have a joinid in the list.\n", + "var_df.loc[var_df.soma_joinid.isin(var_joinids)]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/api/python/notebooks/census_datasets.ipynb b/api/python/notebooks/census_datasets.ipynb new file mode 100644 index 000000000..bb7f1bd72 --- /dev/null +++ b/api/python/notebooks/census_datasets.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Census Datasets example\n", + "\n", + "*Goal:* demonstrate basic use of the `census_datasets` dataframe.\n", + "\n", + "Each Cell Census contains a top-level dataframe itemizing the datasets contained therein. You can read this into a Pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
collection_idcollection_namecollection_doidataset_iddataset_titledataset_h5ad_pathdataset_total_cell_count
soma_joinid
003f821b4-87be-4ff4-b65a-b5fc00061da7Local and systemic responses to SARS-CoV-2 inf...10.1038/s41586-021-04345-xedc8d3fe-153c-4e3d-8be0-2108d30f8d70Airwayedc8d3fe-153c-4e3d-8be0-2108d30f8d70.h5ad236977
103f821b4-87be-4ff4-b65a-b5fc00061da7Local and systemic responses to SARS-CoV-2 inf...10.1038/s41586-021-04345-x2a498ace-872a-4935-984b-1afa70fd9886PBMC2a498ace-872a-4935-984b-1afa70fd9886.h5ad422220
243d4bb39-21af-4d05-b973-4c1fed7b916cTranscriptional Programming of Normal and Infl...10.1016/j.celrep.2018.09.006f512b8b6-369d-4a85-a695-116e0806857fSkinf512b8b6-369d-4a85-a695-116e0806857f.h5ad68036
30434a9d4-85fd-4554-b8e3-cf6c582bb2faAcute COVID-19 cohort across a range of WHO ca...10.1101/2020.11.20.20227355fa8605cf-f27e-44af-ac2a-476bee4410d3PBMCsfa8605cf-f27e-44af-ac2a-476bee4410d3.h5ad59506
43472f32d-4a33-48e2-aad5-666d4631bf4cA single-cell transcriptome atlas of the adult...10.15252/embj.2018100811d5c67a4e-a8d9-456d-a273-fa01adb1b308Retinad5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad19694
........................
347f70ebd97-b3bc-44fe-849d-c18e08fe773dA transcriptomic atlas of the mouse cerebellum...10.1101/2020.03.04.976407e0ed3c55-aff6-4bb7-b6ff-98a2d90b890cA transcriptomic atlas of the mouse cerebellume0ed3c55-aff6-4bb7-b6ff-98a2d90b890c.h5ad611034
3485d445965-6f1a-4b68-ba3a-b8f765155d3aA molecular cell atlas of the human lung from ...10.1038/s41586-020-2922-4e04daea4-4412-45b5-989e-76a9be070a89Krasnow Lab Human Lung Cell Atlas, Smart-seq2e04daea4-4412-45b5-989e-76a9be070a89.h5ad9409
3495d445965-6f1a-4b68-ba3a-b8f765155d3aA molecular cell atlas of the human lung from ...10.1038/s41586-020-2922-48c42cfd0-0b0a-46d5-910c-fc833d83c45eKrasnow Lab Human Lung Cell Atlas, 10X8c42cfd0-0b0a-46d5-910c-fc833d83c45e.h5ad65662
35017481d16-ee44-49e5-bcf0-28c0780d8c4aSingle-Cell Sequencing of Developing Human Gut...10.1016/j.devcel.2020.11.0108e47ed12-c658-4252-b126-381df8d52a3dPaediatric Human Gut (4-14y)8e47ed12-c658-4252-b126-381df8d52a3d.h5ad22502
35117481d16-ee44-49e5-bcf0-28c0780d8c4aSingle-Cell Sequencing of Developing Human Gut...10.1016/j.devcel.2020.11.010b46237d1-19c6-4af2-9335-9854634bad16Fetal Human Gut (6-11 PCW)b46237d1-19c6-4af2-9335-9854634bad16.h5ad62849
\n", + "

352 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " collection_id \\\n", + "soma_joinid \n", + "0 03f821b4-87be-4ff4-b65a-b5fc00061da7 \n", + "1 03f821b4-87be-4ff4-b65a-b5fc00061da7 \n", + "2 43d4bb39-21af-4d05-b973-4c1fed7b916c \n", + "3 0434a9d4-85fd-4554-b8e3-cf6c582bb2fa \n", + "4 3472f32d-4a33-48e2-aad5-666d4631bf4c \n", + "... ... \n", + "347 f70ebd97-b3bc-44fe-849d-c18e08fe773d \n", + "348 5d445965-6f1a-4b68-ba3a-b8f765155d3a \n", + "349 5d445965-6f1a-4b68-ba3a-b8f765155d3a \n", + "350 17481d16-ee44-49e5-bcf0-28c0780d8c4a \n", + "351 17481d16-ee44-49e5-bcf0-28c0780d8c4a \n", + "\n", + " collection_name \\\n", + "soma_joinid \n", + "0 Local and systemic responses to SARS-CoV-2 inf... \n", + "1 Local and systemic responses to SARS-CoV-2 inf... \n", + "2 Transcriptional Programming of Normal and Infl... \n", + "3 Acute COVID-19 cohort across a range of WHO ca... \n", + "4 A single-cell transcriptome atlas of the adult... \n", + "... ... \n", + "347 A transcriptomic atlas of the mouse cerebellum... \n", + "348 A molecular cell atlas of the human lung from ... \n", + "349 A molecular cell atlas of the human lung from ... \n", + "350 Single-Cell Sequencing of Developing Human Gut... \n", + "351 Single-Cell Sequencing of Developing Human Gut... \n", + "\n", + " collection_doi \\\n", + "soma_joinid \n", + "0 10.1038/s41586-021-04345-x \n", + "1 10.1038/s41586-021-04345-x \n", + "2 10.1016/j.celrep.2018.09.006 \n", + "3 10.1101/2020.11.20.20227355 \n", + "4 10.15252/embj.2018100811 \n", + "... ... \n", + "347 10.1101/2020.03.04.976407 \n", + "348 10.1038/s41586-020-2922-4 \n", + "349 10.1038/s41586-020-2922-4 \n", + "350 10.1016/j.devcel.2020.11.010 \n", + "351 10.1016/j.devcel.2020.11.010 \n", + "\n", + " dataset_id \\\n", + "soma_joinid \n", + "0 edc8d3fe-153c-4e3d-8be0-2108d30f8d70 \n", + "1 2a498ace-872a-4935-984b-1afa70fd9886 \n", + "2 f512b8b6-369d-4a85-a695-116e0806857f \n", + "3 fa8605cf-f27e-44af-ac2a-476bee4410d3 \n", + "4 d5c67a4e-a8d9-456d-a273-fa01adb1b308 \n", + "... ... \n", + "347 e0ed3c55-aff6-4bb7-b6ff-98a2d90b890c \n", + "348 e04daea4-4412-45b5-989e-76a9be070a89 \n", + "349 8c42cfd0-0b0a-46d5-910c-fc833d83c45e \n", + "350 8e47ed12-c658-4252-b126-381df8d52a3d \n", + "351 b46237d1-19c6-4af2-9335-9854634bad16 \n", + "\n", + " dataset_title \\\n", + "soma_joinid \n", + "0 Airway \n", + "1 PBMC \n", + "2 Skin \n", + "3 PBMCs \n", + "4 Retina \n", + "... ... \n", + "347 A transcriptomic atlas of the mouse cerebellum \n", + "348 Krasnow Lab Human Lung Cell Atlas, Smart-seq2 \n", + "349 Krasnow Lab Human Lung Cell Atlas, 10X \n", + "350 Paediatric Human Gut (4-14y) \n", + "351 Fetal Human Gut (6-11 PCW) \n", + "\n", + " dataset_h5ad_path \\\n", + "soma_joinid \n", + "0 edc8d3fe-153c-4e3d-8be0-2108d30f8d70.h5ad \n", + "1 2a498ace-872a-4935-984b-1afa70fd9886.h5ad \n", + "2 f512b8b6-369d-4a85-a695-116e0806857f.h5ad \n", + "3 fa8605cf-f27e-44af-ac2a-476bee4410d3.h5ad \n", + "4 d5c67a4e-a8d9-456d-a273-fa01adb1b308.h5ad \n", + "... ... \n", + "347 e0ed3c55-aff6-4bb7-b6ff-98a2d90b890c.h5ad \n", + "348 e04daea4-4412-45b5-989e-76a9be070a89.h5ad \n", + "349 8c42cfd0-0b0a-46d5-910c-fc833d83c45e.h5ad \n", + "350 8e47ed12-c658-4252-b126-381df8d52a3d.h5ad \n", + "351 b46237d1-19c6-4af2-9335-9854634bad16.h5ad \n", + "\n", + " dataset_total_cell_count \n", + "soma_joinid \n", + "0 236977 \n", + "1 422220 \n", + "2 68036 \n", + "3 59506 \n", + "4 19694 \n", + "... ... 
\n", + "347 611034 \n", + "348 9409 \n", + "349 65662 \n", + "350 22502 \n", + "351 62849 \n", + "\n", + "[352 rows x 7 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import cell_census\n", + "from cell_census.experiment_query import experiment_query, AxisQuery\n", + "\n", + "census = cell_census.open_soma()\n", + "census_datasets = census[\"census_info\"][\"datasets\"].read_as_pandas_all()\n", + "\n", + "# for convenience, indexing on the soma_joinid which links this to other census data.\n", + "census_datasets = census_datasets.set_index(\"soma_joinid\")\n", + "census_datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sum cells across all datasets should match the number of cells across all SOMA experiments (human, mouse)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Count by experiment:\n", + "\t34115852 cells in homo_sapiens\n", + "\t3409719 cells in mus_musculus\n", + "\n", + "Found 37525571 cells in all experiments.\n", + "Found 37525571 cells in all datasets.\n" + ] + } + ], + "source": [ + "# Count cells across all experiments\n", + "all_experiments = (\n", + " (organism_name, organism_experiment) for organism_name, organism_experiment in census[\"census_data\"].items()\n", + ")\n", + "experiments_total_cells = 0\n", + "print(\"Count by experiment:\")\n", + "for organism_name, organism_experiment in all_experiments:\n", + " num_cells = len(organism_experiment.obs.read_as_pandas_all(column_names=[\"soma_joinid\"]))\n", + " print(f\"\\t{num_cells} cells in {organism_name}\")\n", + " experiments_total_cells += num_cells\n", + "\n", + "print(f\"\\nFound {experiments_total_cells} cells in all experiments.\")\n", + "\n", + "# Count cells across all datasets\n", + "print(f\"Found {census_datasets.dataset_total_cell_count.sum()} cells in all datasets.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets pick one dataset to slice out of the census, and turn into an [AnnData](https://anndata.readthedocs.io/en/latest/) in-memory object. This can be used with the [ScanPy](https://scanpy.readthedocs.io/en/stable/) toolchain. You can also save this AnnData locally using the AnnData [`write`](https://anndata.readthedocs.io/en/latest/api.html#writing) API." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
collection_idcollection_namecollection_doidataset_iddataset_titledataset_h5ad_pathdataset_total_cell_count
soma_joinid
1590b9d8a04-bb9d-44da-aa27-705bb65b54ebTabula Muris Senis10.1038/s41586-020-2496-10bd1a1de-3aee-40e0-b2ec-86c7a30c7149Bone marrow - A single-cell transcriptomic atl...0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad40220
\n", + "
" + ], + "text/plain": [ + " collection_id collection_name \\\n", + "soma_joinid \n", + "159 0b9d8a04-bb9d-44da-aa27-705bb65b54eb Tabula Muris Senis \n", + "\n", + " collection_doi dataset_id \\\n", + "soma_joinid \n", + "159 10.1038/s41586-020-2496-1 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149 \n", + "\n", + " dataset_title \\\n", + "soma_joinid \n", + "159 Bone marrow - A single-cell transcriptomic atl... \n", + "\n", + " dataset_h5ad_path \\\n", + "soma_joinid \n", + "159 0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad \n", + "\n", + " dataset_total_cell_count \n", + "soma_joinid \n", + "159 40220 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "census_datasets[census_datasets.dataset_id == \"0bd1a1de-3aee-40e0-b2ec-86c7a30c7149\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a query on the mouse experiment, \"RNA\" measurement, for the dataset_id." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 40220 × 52373\n", + " obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id'\n", + " var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mouse = census[\"census_data\"][\"mus_musculus\"]\n", + "with experiment_query(\n", + " mouse, \"RNA\", obs_query=AxisQuery(value_filter=\"dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'\")\n", + ") as query:\n", + " adata = query.read_as_anndata(\"raw\")\n", + "\n", + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use the `cell_census.get_h5ad_uri()` API to fetch a URI pointing to the H5AD associated with this `dataset_id`. This is the same H5AD you can download from the CELLxGENE Portal, and may contain additional data-submittor provided information which was not included in the Cell Census.\n", + "\n", + "The \"locator\" returned by this API will include a `uri` and additional information that may be necessary to use the URI (eg, the S3 region).\n", + "\n", + "You will need to use a download API to fetch this H5AD, such as [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'uri': 's3://cellxgene-data-public/cell-census/2022-11-29/h5ads/0bd1a1de-3aee-40e0-b2ec-86c7a30c7149.h5ad',\n",
+ " 's3_region': 'us-west-2'}"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cell_census.get_source_h5ad_uri(\"0bd1a1de-3aee-40e0-b2ec-86c7a30c7149\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.9.5 ('venv': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+} diff --git a/api/python/notebooks/census_high_variable_genes.ipynb b/api/python/notebooks/census_high_variable_genes.ipynb new file mode 100644 index 000000000..97c50e1ca --- /dev/null +++ b/api/python/notebooks/census_high_variable_genes.ipynb @@ -0,0 +1,351 @@ +{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Computing on X: Highly Variable Genes\n",
+ "\n",
+ "*Goal:* demonstrate larger-than-core computation on X.\n",
+ "\n",
+ "This demo finds highly variable genes in a user-specified cell selection. It is similar to the [scanpy.pp.highly_variable_genes](https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.highly_variable_genes.html) function, when called with `flavor='seurat_v3'`.\n",
+ "\n",
+ "*NOTE*: when query results are small, it may be easier to use the SOMAExperiment Query class to extract an AnnData, and then just compute over that. This notebook demonstrates how to incrementally process larger-than-core (RAM) data using incremental (online) algorithms."
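,
+ "\n",
+ "As a sketch of the statistic computed below (the `seurat_v3` flavor): for each gene $j$ with mean $\\mu_j$ and loess-regularized standard deviation $\\sigma_j$, counts are clipped at $\\mu_j + \\sqrt{N}\\,\\sigma_j$, and the normalized variance is the variance of the standardized, clipped counts $c_{ij}$:\n",
+ "\n",
+ "$$\\mathrm{Var}_{norm}(j) = \\frac{1}{N-1} \\sum_i \\left( \\frac{c_{ij} - \\mu_j}{\\sigma_j} \\right)^2 = \\frac{N \\mu_j^2 + \\sum_i c_{ij}^2 - 2 \\mu_j \\sum_i c_{ij}}{(N-1)\\,\\sigma_j^2}\n",
+ "$$\n",
+ "\n",
+ "so the second pass over X only needs to accumulate $\\sum_i c_{ij}$ and $\\sum_i c_{ij}^2$ of the clipped values."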
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cell_census\n",
+ "\n",
+ "census = cell_census.open_soma()\n",
+ "human = census[\"census_data\"][\"homo_sapiens\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from cell_census.experiment_query import ExperimentQuery\n",
+ "from cell_census.compute import OnlineMatrixMeanVariance\n",
+ "\n",
+ "\n",
+ "def highly_variable_genes(query: ExperimentQuery, n_top_genes: int = 10) -> pd.DataFrame:\n",
+ "    \"\"\"\n",
+ "    Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy\n",
+ "    \"\"\"\n",
+ "\n",
+ "    try:\n",
+ "        import skmisc.loess\n",
+ "    except ImportError:\n",
+ "        raise ImportError(\"Please install the scikit-misc package via `pip install --user scikit-misc`\")\n",
+ "\n",
+ "    var_df = query.var(column_names=[\"soma_joinid\", \"feature_id\", \"feature_name\"]).to_pandas().set_index(\"soma_joinid\")\n",
+ "\n",
+ "    indexer = query.get_indexer()\n",
+ "    mvn = OnlineMatrixMeanVariance(query.n_obs, query.n_vars)\n",
+ "    for arrow_tbl in query.X(\"raw\"):\n",
+ "        var_dim = indexer.var_index(arrow_tbl[\"soma_dim_1\"])\n",
+ "        data = arrow_tbl[\"soma_data\"].to_numpy()\n",
+ "        mvn.update(var_dim, data)\n",
+ "\n",
+ "    u, v = mvn.finalize()\n",
+ "    var_df = var_df.assign(means=pd.Series(u, index=var_df.index), variances=pd.Series(v, index=var_df.index))\n",
+ "\n",
+ "    estimated_variances = np.zeros((len(var_df),), dtype=np.float64)\n",
+ "    not_const = v > 0\n",
+ "    y = np.log10(v[not_const])\n",
+ "    x = np.log10(u[not_const])\n",
+ "    model = skmisc.loess.loess(x, y, span=0.3, degree=2)\n",
+ "    model.fit()\n",
+ "    estimated_variances[not_const] = model.outputs.fitted_values\n",
+ "    reg_std = np.sqrt(10**estimated_variances)\n",
+ "\n",
+ "    # A second pass over the data is required because the clip value\n",
+ "    # is determined by the first pass\n",
+ "    N = query.n_obs\n",
+ "    vmax = np.sqrt(N)\n",
+ "    clip_val = reg_std * vmax + u\n",
+ "    counts_sum = np.zeros((query.n_vars,), dtype=np.float64)  # clipped\n",
+ "    squared_counts_sum = np.zeros((query.n_vars,), dtype=np.float64)  # clipped\n",
+ "    for arrow_tbl in query.X(\"raw\"):\n",
+ "        var_dim = indexer.var_index(arrow_tbl[\"soma_dim_1\"])\n",
+ "        data = arrow_tbl[\"soma_data\"].to_numpy()\n",
+ "        # clip\n",
+ "        mask = data > clip_val[var_dim]\n",
+ "        data = data.copy()\n",
+ "        data[mask] = clip_val[var_dim[mask]]\n",
+ "        np.add.at(counts_sum, var_dim, data)\n",
+ "        np.add.at(squared_counts_sum, var_dim, data**2)\n",
+ "\n",
+ "    norm_gene_vars = (1 / ((N - 1) * np.square(reg_std))) * (\n",
+ "        (N * np.square(u)) + squared_counts_sum - 2 * counts_sum * u\n",
+ "    )\n",
+ "    norm_gene_vars = norm_gene_vars.reshape(1, -1)\n",
+ "\n",
+ "    # argsort twice gives ranks, small rank means most variable\n",
+ "    ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1), axis=1)\n",
+ "\n",
+ "    # this is done in SelectIntegrationFeatures() in Seurat v3\n",
+ "    ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32)\n",
+ "    num_batches_high_var = np.sum((ranked_norm_gene_vars < n_top_genes).astype(int), axis=0)\n",
+ "    ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan\n",
+ "    ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)\n",
+ "    median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)\n",
+ "\n",
+ "    var_df = var_df.assign(\n",
+ 
highly_variable_nbatches=pd.Series(num_batches_high_var, index=var_df.index),\n",
+ "        highly_variable_rank=pd.Series(median_ranked, index=var_df.index),\n",
+ "        variances_norm=pd.Series(np.mean(norm_gene_vars, axis=0), index=var_df.index),\n",
+ "    )\n",
+ "\n",
+ "    sorted_index = (\n",
+ "        var_df[[\"highly_variable_rank\", \"highly_variable_nbatches\"]]\n",
+ "        .sort_values(\n",
+ "            [\"highly_variable_rank\", \"highly_variable_nbatches\"],\n",
+ "            ascending=[True, False],\n",
+ "            na_position=\"last\",\n",
+ "        )\n",
+ "        .index\n",
+ "    )\n",
+ "    var_df[\"highly_variable\"] = False\n",
+ "    var_df = var_df.drop(columns=[\"highly_variable_nbatches\"])\n",
+ "    var_df.loc[sorted_index[: int(n_top_genes)], \"highly_variable\"] = True\n",
+ "    return var_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To use this function, which is also available in `cell_census.compute`, open an ExperimentQuery and pass it to the function as a parameter."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_idfeature_namemeansvarianceshighly_variable_rankvariances_normhighly_variable
soma_joinid
14173ENSG00000011465DCN1.4698092.799015e+049.019.943697True
17553ENSG00000234745HLA-B2.5932486.411800e+045.020.669138True
17949ENSG00000185885IFITM15.7268844.180694e+050.026.027514True
18020ENSG00000163453IGFBP79.2720801.508529e+061.025.324222True
22441ENSG00000111341MGP3.3343182.661684e+057.020.188246True
28680ENSG00000112562SMOC20.5455295.397097e+038.020.102556True
29507ENSG00000105467SYNGR40.8833613.483089e+046.020.313564True
40758ENSG00000253504MTCYBP195.5585383.219079e+052.024.388447True
42651ENSG00000270225MTCO2P228.8554299.335230e+053.023.784311True
52214ENSG00000232179MTATP6P296.4756137.000857e+054.021.696998True
\n", + "
" + ], + "text/plain": [ + " feature_id feature_name means variances \\\n", + "soma_joinid \n", + "14173 ENSG00000011465 DCN 1.469809 2.799015e+04 \n", + "17553 ENSG00000234745 HLA-B 2.593248 6.411800e+04 \n", + "17949 ENSG00000185885 IFITM1 5.726884 4.180694e+05 \n", + "18020 ENSG00000163453 IGFBP7 9.272080 1.508529e+06 \n", + "22441 ENSG00000111341 MGP 3.334318 2.661684e+05 \n", + "28680 ENSG00000112562 SMOC2 0.545529 5.397097e+03 \n", + "29507 ENSG00000105467 SYNGR4 0.883361 3.483089e+04 \n", + "40758 ENSG00000253504 MTCYBP19 5.558538 3.219079e+05 \n", + "42651 ENSG00000270225 MTCO2P22 8.855429 9.335230e+05 \n", + "52214 ENSG00000232179 MTATP6P29 6.475613 7.000857e+05 \n", + "\n", + " highly_variable_rank variances_norm highly_variable \n", + "soma_joinid \n", + "14173 9.0 19.943697 True \n", + "17553 5.0 20.669138 True \n", + "17949 0.0 26.027514 True \n", + "18020 1.0 25.324222 True \n", + "22441 7.0 20.188246 True \n", + "28680 8.0 20.102556 True \n", + "29507 6.0 20.313564 True \n", + "40758 2.0 24.388447 True \n", + "42651 3.0 23.784311 True \n", + "52214 4.0 21.696998 True " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from cell_census.experiment_query import experiment_query, AxisQuery\n", + "\n", + "with experiment_query(human, measurement_name=\"RNA\", obs_query=AxisQuery(value_filter=\"tissue == 'brain'\")) as query:\n", + " hvg = highly_variable_genes(query)\n", + "\n", + "display(hvg[hvg.highly_variable])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/api/python/notebooks/census_query_extract.ipynb b/api/python/notebooks/census_query_extract.ipynb new file mode 100644 index 000000000..5ad237a0a --- /dev/null +++ b/api/python/notebooks/census_query_extract.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cell Census query & extract subsets\n", + "\n", + "_Goal:_ demonstrate the ability to query subsets of the Cell Census based upon user-defined obs/var metadata, and extract those slices into in-memory data structures for further analysis.\n", + "\n", + "**NOTE:** all examples in this notebook assume that sufficient memory exists on the host machine to store query results. There are other notebooks which provide examples for out-of-core processing." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cell_census\n", + "\n", + "census = cell_census.open_soma(census_version=\"latest\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Cell Census includes SOMA Experiments for both human and mouse. These experiments can be queried based upon metadata values (eg, tissue type), and the query result can be extracted into a variety of formats.\n", + "\n", + "> ⚠️ **NOTE:** The following is experimental query code. It is is built upon SOMA, but not (yet) part of SOMA. 
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 119269 × 60564\n",
+ "    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id'\n",
+ "    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Define a simple obs-axis query: lung (UBERON:0002048) cells from female (PATO:0000383) donors, for two cell types.\n",
+ "\n",
+ "adata = cell_census.get_anndata(\n",
+ "    census,\n",
+ "    \"Homo sapiens\",\n",
+ "    obs_query={\n",
+ "        \"tissue_ontology_term_id\": \"UBERON:0002048\",\n",
+ "        \"sex_ontology_term_id\": \"PATO:0000383\",\n",
+ "        \"cell_type_ontology_term_id\": [\"CL:0002063\", \"CL:0000499\"],\n",
+ "    },\n",
+ ")\n",
+ "\n",
+ "display(adata)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 41332 × 3\n",
+ "    obs: 'tissue', 'cell_type', 'sex'\n",
+ "    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# You can also query on both axes. This example adds a var-axis query for a handful of genes, and queries the mouse experiment.\n",
+ "\n",
+ "adata = cell_census.get_anndata(\n",
+ "    census,\n",
+ "    \"Mus musculus\",\n",
+ "    obs_query={\"tissue\": \"brain\"},\n",
+ "    var_query={\"feature_name\": [\"Gm16259\", \"Dcaf5\", \"Gm53058\"]},\n",
+ "    column_names={\"obs\": [\"tissue\", \"cell_type\", \"sex\"]},\n",
+ ")\n",
+ "\n",
+ "display(adata)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.9.5 ('venv': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/api/python/notebooks/census_rank_gene_groups.ipynb b/api/python/notebooks/census_rank_gene_groups.ipynb
new file mode 100644
index 000000000..3e277579a
--- /dev/null
+++ b/api/python/notebooks/census_rank_gene_groups.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Cell Census - demo ScanPy rank_genes_groups\n",
+ "\n",
+ "_Goal_: demonstrate a simple Student's t-test between two medium-size (i.e., all of the extracted data fits into memory) \"obs\" metadata queries/slices, using [scanpy.tl.rank_genes_groups](https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html#scanpy-tl-rank-genes-groups).\n",
+ "\n",
+ "**NOTE:** all examples in this notebook assume that sufficient memory exists on the host machine to store query results. There are other notebooks which provide examples for out-of-core processing."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cell_census\n",
+ "from cell_census.experiment_query import AxisQuery, experiment_query\n",
+ "\n",
+ "census = cell_census.open_soma(census_version=\"latest\")\n",
+ "human = census[\"census_data\"][\"homo_sapiens\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a first step, query and read a slice of data into an AnnData. In this example, lung cells (UBERON:0002048) labelled as fibroblasts (CL:0000057) and natural killer cells (CL:0000623) are selected.\n",
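+ "\n",
+ "The `value_filter` string below combines these criteria with the `and` / `in` operators."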
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 195477 × 60564\n", + " obs: 'cell_type_ontology_term_id', 'tissue_ontology_term_id'\n", + " var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with experiment_query(\n", + " human,\n", + " \"RNA\",\n", + " obs_query=AxisQuery(\n", + " value_filter=\"tissue_ontology_term_id == 'UBERON:0002048' and cell_type_ontology_term_id in ['CL:0000057', 'CL:0000623']\"\n", + " ),\n", + ") as query:\n", + " adata = query.read_as_anndata(\n", + " X_name=\"raw\",\n", + " column_names={\n", + " \"obs\": [\n", + " \"cell_type_ontology_term_id\",\n", + " \"tissue_ontology_term_id\",\n", + " ],\n", + " \"var\": None,\n", + " },\n", + " )\n", + "\n", + "adata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the data read into an AnnData, use the ScanPy API to compute the rank genes groups." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABCsAAAHHCAYAAACWddIPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAACpqUlEQVR4nOzdeVyN+fs/8Fe7tVJp1aZCFCVLWYrKGDIYSfbdMMMwMobsYx9jmIyPbabsMyTDGEOkGvsuhLKEoiRKGy3K/fujX+frqEzd56STXs/H4zw+p/u+z3Wuc47P3Pe5zvt9vZUEQRBARERERERERKQglKs6ASIiIiIiIiKit7FYQUREREREREQKhcUKIiIiIiIiIlIoLFYQERERERERkUJhsYKIiIiIiIiIFAqLFURERERERESkUFisICIiIiIiIiKFwmIFERERERERESkUFiuIiIiIiIiISKGwWEFERERERERECoXFCiIZxcXFYfz48WjcuDFq1aoFTU1NdOzYEQEBAcjJyZEcZ2FhgV69eol+njNnzqBTp06oU6cODA0NMXnyZGRnZ5c4Li8vDzNmzICxsTFq166N9u3bIywsTKFi/vvvv1BSUir1du7cOclxDx8+LPM4JSUljBs3riJv4Ufl0KFDWLBgQVWnQUREVYjXINLy8/OxdOlSNGvWDLVq1YKBgQG8vLzw+PFjyTEXL17EpEmT0KJFC9StWxdmZmYYMGAA7ty5UyLer7/+Cjc3NxgYGEBDQwOWlpYYNWoUHj58WM537uN069YtLFiwoMa/D1T5VKs6AaLq7J9//oGPjw80NDQwfPhw2NnZIT8/H6dOncL06dNx8+ZNbNq0SebnuXr1Kjw8PGBra4tVq1bh8ePHWLlyJe7evYvDhw9LHTty5EiEhITgm2++gY2NDbZs2YKePXsiMjISnTp1UpiYADB58mS0bdtWapu1tbXkfsOGDbF9+/YSjwsNDcXOnTvxySefVOyN/IgcOnQI//vf/1iwICKqoXgNIh3z9evX8PLywpkzZzBu3Di0bNkSL168wPnz55GRkYFGjRoBAH744QecPn0aPj4+aNmyJZKTk7F27Vq0bt0a586dg52dnSRmVFQULC0t0bt3bzRo0AAPHjzAr7/+ioMHD+LatWswNjaW+f2tjm7duoXvv/8eXbp0gYWFRVWnQx8zgYhEuX//vlCvXj2hWbNmQlJSUon9d+/eFX7++WfJ3+bm5oKXl5eo5+rRo4dgZGQkZGRkSLb9+uuvAgDhyJEjkm3nz58XAAg//vijZFtOTo5gZWUluLi4KEzMyMhIAYCwZ88eMW+H4OHhIWhqago5OTmiHv+hvX79WsjLy5NrzIkTJwr8TzgRUc3Ea5CSMX/44QdBTU1NOH/+/Htfz+nTp0uck+/cuSNoaGgIQ4YMee9jBUEQLl26JAAQli1b9p/HKoI3b94Ir169kmvMPXv2CACEyMhIucYlehevdIlEmjBhggBAOH36dLmOL8+FwsuXL4WYmBjh2bNnkm0ZGRmCqqqqMH36dKlj8/LyhHr16gljxoyRbJs+fbqgoqIidfIXBEFYunSpAEBISEhQiJhvFysyMzOF169fv/d9eVtSUpKgrKwsjBw58r3HJScnCyoqKsKCBQtK7IuNjRUACL/88osgCIKQn58vLFiwQLC2thY0NDQEHR0doWPHjsLRo0fLnVexBw8eSC6sVq9eLTRu3FhQVlYWoqKiBEEQhJiYGMHb21to0KCBoKGhITg5OQl//fWXVIz/ymfEiBECgBI3IiKqGXgNIh2zsLBQMDY2FgYMGCAIQtGPBC9fvnzv631X69athdatW//ncc+fPxcACDNmzHjvcS1atBC6dOlSYntxrt7e3pJtf/zxh9C6dWuhXr16Qv369QU7OzupYlNFFH/WoaGhgpOTk6ChoSGsXr1aEARBePHihTBlyhShUaNGgrq6umBlZSUsX75cKCwslIrxvnw2b95c6jUICxdUGdizgkikv//+G40bN0aHDh3kFvPChQuwtbXF2rVrJduio6NRUFCANm3aSB2rrq4OBwcHREVFSbZFRUWhSZMm0NTUlDq2Xbt2AIqGXSpCzGKjRo2CpqYmatWqha5du+LSpUtlvjfFdu3ahTdv3mDIkCHvPc7AwABubm4IDg4usW/37t1QUVGBj48PAGDBggX4/vvv0bVrV6xduxazZ8+GmZkZrly58p/5lGXz5s345Zdf8MUXX+Cnn36Cjo4Obt68CWdnZ8TExGDmzJn46aefULduXfTt2xf79u2TPPa/8hk/fjy6desGANi+fbvkRkRENQOvQa
Rj3rp1C0lJSWjZsiW++OIL1K1bF3Xr1kXLli0RGRn5n69dEAQ8ffoUenp6pe5PTU1FSkoKLl26hFGjRgEAPDw83hvT19cXJ06cQHJystT2U6dOISkpCQMHDgQAhIWFYdCgQWjQoAF++OEHLF++HF26dMHp06f/M++y3L59G4MGDUK3bt0QEBAABwcHvHr1Cm5ubtixYweGDx+ONWvWoGPHjvD394efn5/ksf+Vj6urKyZPngwAmDVrluQaxNbWVnS+RGVhzwoiETIzM5GYmIg+ffpU+nM9efIEAGBkZFRin5GREU6ePCl1bFnHAUBSUpJCxFRXV4e3tzd69uwJPT093Lp1CytXrkTnzp1x5swZODo6lohRbOfOnTAyMoK7u3uZxxTz9fXF+PHjcePGDak5qLt375Y0zAKK5v327NlTLnN7iz1+/Bj37t1Dw4YNJds8PT1hZmaGixcvQkNDAwDw1VdfoVOnTpgxYwY+//zzcuXj4uKCJk2aICwsDEOHDpVbzkREpPh4DVIy5t27dwEAq1evho6ODjZu3AgAWLp0KT799FNcvHgRLVu2LPN17ty5E4mJiVi4cGGp+01MTJCXlwcA0NXVxZo1ayQ/GpTF19cX8+bNQ0hICCZNmiTZvnv3btSrVw9eXl4Ais75mpqaOHLkCFRUVN4bs7zu3buH0NBQdO/eXbJt8eLFiIuLQ1RUFGxsbAAU/fhhbGyMH3/8EdOmTYOpqel/5tO4cWN07txZ8h506dJFLjkTlYYjK4hEyMzMBADUr19frnG7dOkCQRCkmiYWd/Mu/nL7tlq1akl1+87JySnzuLdjVXXMDh06ICQkBKNHj0bv3r0xc+ZMnDt3DkpKSvD39y/x+GJ37tzB5cuXMXDgQCgr//d/vvr16wdVVVXs3r1bsu3GjRu4desWfH19Jdu0tbVx8+ZNycWOPHh7e0sVKtLS0hAREYEBAwYgKysLz58/x/Pnz5Gamoru3bvj7t27SExMrLR8iIjo48BrkJIxi1cRycrKQnh4OEaOHImRI0fi2LFjEAQBK1asKPN1x8bGYuLEiXBxccGIESNKPebw4cM4dOgQfvrpJ5iZmeHly5dlxivWpEkTODg4SF2DFBYWIiQkBJ999hlq164NoOic//LlyzJXOBHD0tJSqlABAHv27EHnzp3RoEEDyTXI8+fP4enpicLCQpw4caLS8iESi8UKIhGKhyNmZWVV+nMVn8yKK/pvy83NlewvPras496OVdUxS2NtbY0+ffogMjIShYWFpR6zc+dOAPjPKSDF9PT04OHhITUVZPfu3VBVVUW/fv0k2xYuXIj09HQ0adIE9vb2mD59Oq5fv16u5yiLpaWl1N/37t2DIAiYO3cuGjZsKHWbP38+ACAlJaXS8iEioo8Dr0HKjtmxY0eYmppKjjMzM0OnTp1w5syZUl9fcnIyvLy8oKWlhZCQkDJHNnTt2hU9evSAn58f9uzZg++//15qukxZfH19cfr0acmPEf/++y9SUlKkfjD56quv0KRJE/To0QONGjXC6NGjERoa+p+x3+fdaxCgaPRJaGhoiWsQT09PAP93DVIZ+RCJxWIFkQiampowNjbGjRs3Kv25ioc6Fg+bfNuTJ0+kls0yMjIq8zgAkmOrOmZZTE1NkZ+fX+YvFr///juaNm0KJyen/4xVbODAgbhz545kXmtwcDA8PDyk5qW6uroiLi4OQUFBsLOzw2+//YbWrVvjt99+K/fzvOvd4sybN28AAN9++y3CwsJKvRUv21oZ+RAR0ceB1yAlYxb/b/H0zrfp6+vjxYsXJbZnZGSgR48eSE9PR2hoaLmXIbWysoKjo6PkB5T38fX1hSAI2LNnD4CiaxAtLS18+umnUvldvXoVBw4cQO/evREZGYkePXqUOcqjPEr7gejNmzfo1q1bmdcg3t7elZYPkVgsVhCJ1KtXL8TFxeHs2bOV+jx2dnZQVVUt0XwyPz8fV69ehYODg2Sbg4MD7ty5IxkiWuz8+fOS/YoQsyz3799HrVq1UK9evRL7zp8/j3v37pV7VEWxvn37Ql1dHbt378bVq1dx584dSVOrt+no6GDUqFH4448/8OjRI7Rs2VJqKKysGjduDABQU1ODp6dnqbe3h/T+Vz5KSkpyy42IiKoXXoNIx7S3t4eamppkBMPbkpKSpKZlAkUjMz777DPcuXMHBw8eRPPmzd//RrwjJycHGRkZ/3mcpaUl2rVrh927d6OgoAB//vkn+vbtW2Jqi7q6Oj777DOsW7cOcXFxGD9+PLZt24Z79+5VKK/3sbKyQnZ2dpnXIGZmZuXOh9cg9KGwWEEk0nfffYe6deti7NixePr0aYn9cXFxCAgIqFDMV69eITY2Fs+fP5ds09LSgqenJ3bs2CE15HP79u3Izs6WrGgBAP3790dhYaFUY8a8vDxs3rwZ7du3lwyNrOqYz549K/Har127hgMHDuCTTz4ptR/F77//DgAYPHjw+97CErS1tdG9e3cEBwdj165dUFdXR9++faWOSU1Nlfq7Xr16sLa2lhp6mpGRgdjY2HJdnJRGX18fXbp0wcaNG0v9lejt96Q8+dStWxcAkJ6eLiofIiKqvngNIh2zfv366NmzJ86cOYPY2FjJsTExMThz5oxUM8zCwkL4+vri7Nmz2LNnD1xcXEp9PwoKCkodkXHhwgVER0eXWM2kLL6+vjh37hyCgoLw/PlzqSkgQMlzvrKysqQZaPF5//Xr14iNjS31+qG8BgwYgLNnz+LIkSMl9qWnp6OgoKDc+fAahD4UJUEQhKpOgqi6OnDgAHx9fVG7dm0MHz4cdnZ2yM/Px5kzZ7Bnzx6MHDlS0pHawsICtWrVKnX1BkdHR3h5eeHff/9F165dMX/+fKlf0a9cuYIOHTqgefPm+OKLL/D48WP89NNPcHV1LXHSGTBgAPbt24epU6fC2toaW7duxYULFxAeHg5XV1eFiOnu7o7atWujQ4cO0NfXx61bt7Bp0yaoqanh7NmzJZa/KiwshImJCSwtLUX9irRz504MHToU9evXR5cuXXDgwAGp/QYGBujSpQucnJygo6ODS5cuYdOmTZg0aRLWrFkDANiyZQtGjRqFzZs3Y+TIkWU+18OHD2FpaYkff/wR3377rdS+W7duoVOnTlBWVsa4cePQuHFjPH36FGfPnsXjx49x7dq1cuezZ88eDBgwAMOGDUP37t2hoqJS6ogRIiL6OPEaRDrmrVu30L59e9SvX1+ytOaaNWtQUFCAqKgomJiYAAC++eYbBAQE4LPPPsOAAQNKvB/F71F6ejoaNWoEX19ftGjRAnXr1kV0dDQ2b96MWrVq4dy5c5JVNd7n8ePHMDMzQ7169aCmpobk5GSoqalJ9n/++edIS0uDu7s7GjVqhPj4ePzyyy+wsLDA5cuXoaysLLm2GDFiBLZs2fLe57OwsICdnR0OHjwotf3Vq1fo3Lkzrl+/jpEjR8LJyQkvX75EdHQ0QkJC8PDhQ+jp6ZUrn+TkZDRq1Aht27bFhAkToKGhAXd3d+jr6//n+0FUIQIRyeTOnTvCu
HHjBAsLC0FdXV2oX7++0LFjR+GXX34RcnNzJceZm5sLAEq9jRkzRhAEQYiMjBQACPPnzy/xPCdPnhQ6dOgg1KpVS2jYsKEwceJEITMzs8RxOTk5wrfffisYGhoKGhoaQtu2bYXQ0NBSc6+qmAEBAUK7du0EHR0dQVVVVTAyMhKGDh0q3L17t9SYoaGhAgBhzZo1pe7/L5mZmULt2rUFAMKOHTtK7F+8eLHQrl07QVtbW6hdu7bQrFkzYcmSJUJ+fr7kmM2bNwsAhM2bN7/3uR48eCAAEH788cdS98fFxQnDhw8XDA0NBTU1NcHExETo1auXEBISUqF8CgoKhK+//lpo2LChoKSkJPA/50RENQ+vQaRdvnxZ8PT0FOrWrSvUr19f6NOnj3Dnzh2pY9zc3Mp8L94+l+bl5QlTpkwRWrZsKWhqagpqamqCubm5MGbMGOHBgwelPn9ZOnbsKAAQxo4dW2JfSEiI8Mknnwj6+vqCurq6YGZmJowfP1548uSJ5Jjia4sRI0b853OZm5sLXl5epe7LysoS/P39BWtra0FdXV3Q09MTOnToIKxcuVJyjVGefARBEH799VehcePGgoqKigBAiIyMLP8bQlROHFlBRERERERERAqFPSuIiIiIiIiISKGwWEFERERERERECoXFCiIiIiIiIiJSKCxWEBEREREREZFCYbGCiIiIiIiIiBQKixVEREREREREpFBUqzoBRfDmzRskJSWhfv36UFJSqup0iIiIKpUgCMjKyoKxsTGUlfm7RVXiNQgREdU05b0OYbECQFJSEkxNTas6DSIiog/q0aNHaNSoUVWnUaPxGoSIiGqq/7oOYbECQP369QEUvVmamppVnA0REVHlyszMhKmpqeT8R1WH1yBERFTTlPc6hMUKQDLsUlNTkxcKRERUY3DaQdXjNQgREdVU/3UdwomqRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIobBYQUREREREREQKhcUKIiIiIiIiIlIoLFYQERERERERkUJhsYKIiIiIiIiIFAqLFURERERERESkUFisICIiIiIiIiKFwmIFERERERERESkUFiuIiIiIiIiISKGwWEFERERERERECoXFCiIiIiIiIiJSKCxWEBEREREREZFCYbGCiIiIiIiIiBQKixVEREREREREpFBYrCAiIiIiIiIihcJiBREREREREREpFBYriIiIiIiIiEihsFhBRERERERERAqlSosVhYWFmDt3LiwtLVG7dm1YWVlh0aJFEARBcowgCJg3bx6MjIxQu3ZteHp64u7du1Jx0tLSMGTIEGhqakJbWxtjxoxBdnb2h345REREVE3wGoSIiEixVWmx4ocffsD69euxdu1axMTE4IcffsCKFSvwyy+/SI5ZsWIF1qxZgw0bNuD8+fOoW7cuunfvjtzcXMkxQ4YMwc2bNxEWFoaDBw/ixIkT+OKLL6riJREREVE1wGsQIiIixaYkvP0TwgfWq1cvGBgYIDAwULLN29sbtWvXxo4dOyAIAoyNjTFt2jR8++23AICMjAwYGBhgy5YtGDhwIGJiYtC8eXNcvHgRbdq0AQCEhoaiZ8+eePz4MYyNjf8zj8zMTGhpaSEjIwOampqV82KJiIgUBM97vAYhIiKqKuU991XpyIoOHTogPDwcd+7cAQBcu3YNp06dQo8ePQAADx48QHJyMjw9PSWP0dLSQvv27XH27FkAwNmzZ6GtrS25SAAAT09PKCsr4/z586U+b15eHjIzM6VuREREVHPwGoSIiEixqVblk8+cOROZmZlo1qwZVFRUUFhYiCVLlmDIkCEAgOTkZACAgYGB1OMMDAwk+5KTk6Gvry+1X1VVFTo6OpJj3rVs2TJ8//338n45REREVE3wGoSIiEixVenIiuDgYOzcuRO///47rly5gq1bt2LlypXYunVrpT6vv78/MjIyJLdHjx5V6vMRERGRYuE1CBERkWKr0pEV06dPx8yZMzFw4EAAgL29PeLj47Fs2TKMGDEChoaGAICnT5/CyMhI8rinT5/CwcEBAGBoaIiUlBSpuAUFBUhLS5M8/l0aGhrQ0NCohFdERERE1QGvQYiIiBRblY6sePXqFZSVpVNQUVHBmzdvAACWlpYwNDREeHi4ZH9mZibOnz8PFxcXAICLiwvS09Nx+fJlyTERERF48+YN2rdv/wFeBREREVU3vAYhIiJSbFU6suKzzz7DkiVLYGZmhhYtWiAqKgqrVq3C6NGjAQBKSkr45ptvsHjxYtjY2MDS0hJz586FsbEx+vbtCwCwtbXFp59+inHjxmHDhg14/fo1Jk2ahIEDB5arCzcRERHVPLwGISIiUmxVunRpVlYW5s6di3379iElJQXGxsYYNGgQ5s2bB3V1dQCAIAiYP38+Nm3ahPT0dHTq1Anr1q1DkyZNJHHS0tIwadIk/P3331BWVoa3tzfWrFmDevXqlSsPLhtGREQ1Cc97vAYhIiKqKuU991VpsUJR8EKBiIhqEp73FAc/CyIiqmnKe+6r0p4VRERERERERETvYrGCiIiIiIiIiBQKixVEREREREREpFBYrCAiIiIiIiIihcJiBREREREREREpFBYriIiIiIiIiEihsFhBRERERERERAqFxQoiIiIiIiIiUigsVhARERERERGRQmGxgoiIiIiIiIgUCosVRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIobBYQUREREREREQKhcUKIiIiIiIiIlIoLFYQERERERERkUJhsYKIiIiIiIiIFIpqVSdAREREROK8ePEC+/btQ0JCAgDAzMwMffv2hY6OThVnRkREJBuOrCAiIiKqhvbu3YtmzZrh6NGjyMnJQU5ODo4cOYLmzZtj7969VZ0eERGRTDiygoiIiKgamj17Ns6fPw8LCwup7Q8ePECPHj3g7e1dNYkRERHJAUdWEBEREVVDhYWFJQoVAGBpaYmCgoIPnxAREZEcsVhBREREVA21bdsWo0ePxoULF/D06VM8ffoUFy5cwOjRo9GmTZuqTo+IiEgmLFYQERERVUOBgYGwtLTE6NGjYWNjAxsbG4wZMwbm5uYICgqq6vSIiIhkoiQIglDVSVS1zMxMaGlpISMjA5qamlWdDhERUaXieU9x8LMgIqKaprznPo6sICIiIqqmYmNjkZKSIrkfGBiICxcuVHFWREREsmOxgoiIiKga+vHHH+Hm5oY2bdpgx44d+OSTT3DkyBH4+PggICCgqtMjIiKSCZcuJSIiIqqGtmzZgtjYWGRnZ6NZs2a4ceMGLC0t8fz5c3Tp0gVTpkyp6hSJiIhEY7GCiIiIqBrS0NBAgwYN0KBBA+jp6cHS0hIAoKenBzU1tSrOjoiISDacBkJERERUDWloaOCff/7Bjh07oKSkhN27dwMAIiMjoaKiUsXZERERyYYjK4iIiIiqoTVr1uCLL76AsrIy/vrrLyxfvhwjRoxAvXr1EBwcXNXpERERyYTFCiIiIqJqqG3btoiKipL8/ccf
fyA1NRUNGjSAsjIHzxIRUfXGMxkRERHRR0JZWZmFCiIi+ijwbEZERERUDb29POmDBw/QokULGBsbw9LSEtHR0VWYGRERkexYrCAiIiKqhrZu3Sq5P2vWLHz11VfIycnBypUr4efnV4WZERERyY7FCiIiIqJq7tatW5g4cSIAwNvbG8+ePavijIiIiGTDBptERERE1VB6ejr+/vtvCIKA169fS+0TBKGKsiIiIpIPFiuIiIiIqiEzMzOsWrUKAGBgYIDExESYmJggJSUF6urqVZwdERGRbFisICIiIqqG/v3331K36+rq4vjx4x82GSIiIjljzwoiIiKij0BBQQGioqKQnZ2NOnXqVHU6REREMmGxgoiIiKgaioiIgK6uLvT09HD8+HF06NABgwcPhpWVFUdWEBFRtcdpIERERETVkL+/P8LDw5Geng5vb28EBwfD3d0dFy5cwLRp03Dy5MmqTpGIiEg0FiuIiIiIqqH8/Hw4ODgAALS1teHu7g4AaNeuHbKzs6swMyIiItlV6TQQCwsLKCkplbgVrxOem5uLiRMnQldXF/Xq1YO3tzeePn0qFSMhIQFeXl6oU6cO9PX1MX36dBQUFFTFyyEiIqJq4mO4Bnnz5o3kvo+Pj9S+wsLCD5YHERFRZajSYsXFixfx5MkTyS0sLAzA/51wp06dir///ht79uzB8ePHkZSUhH79+kkeX1hYCC8vL+Tn5+PMmTPYunUrtmzZgnnz5lXJ6yEiIqLq4WO4BnFyckJmZiYAYNmyZZLtcXFx0NTU/GB5EBERVQYlQRCEqk6i2DfffIODBw/i7t27yMzMRMOGDfH777+jf//+AIDY2FjY2tri7NmzcHZ2xuHDh9GrVy8kJSXBwMAAALBhwwbMmDEDz549K/ca45mZmdDS0kJGRgZP7kRE9NHjea+kj+kapKCgAAUFBahVq5Zc4hEREclTec99CrMaSH5+Pnbs2IHRo0dDSUkJly9fxuvXr+Hp6Sk5plmzZjAzM8PZs2cBAGfPnoW9vb3kIgEAunfvjszMTNy8ebPM58rLy0NmZqbUjYiIiGqm6noNcv/+fXTt2hWNGzeGn58fcnNzAQCqqqro2rWr6LhERESKQGGKFfv370d6ejpGjhwJAEhOToa6ujq0tbWljjMwMEBycrLkmLcvEor3F+8ry7Jly6ClpSW5mZqayu+FEBERUbVSXa9BvvzyS/Tv3x979uzB8+fP4eHhgaysLACQFC6IiIiqK4UpVgQGBqJHjx4wNjau9Ofy9/dHRkaG5Pbo0aNKf04iIiJSTNX1GiQlJQUTJ06Ek5MTtm3bBi8vL3h4eCAjIwNKSkpyzJqIiOjDU4ilS+Pj43Hs2DH8+eefkm2GhobIz89Henq61C8bT58+haGhoeSYCxcuSMUq7tRdfExpNDQ0oKGhIcdXQERERNVRdb4GycnJkfp71qxZUFdXlxphQUREVF0pxMiKzZs3Q19fH15eXpJtTk5OUFNTQ3h4uGTb7du3kZCQABcXFwCAi4sLoqOjkZKSIjkmLCwMmpqaaN68+Yd7AURERFQtVedrEFtbW4SGhkpt+/bbbzF48GDExcV9kByIiIgqS5WvBvLmzRtYWlpi0KBBWL58udS+L7/8EocOHcKWLVugqamJr7/+GgBw5swZAEXLhjk4OMDY2BgrVqxAcnIyhg0bhrFjx2Lp0qXlzoFd0YmIqCbhea9Idb8GycvLA4BSR2okJibCxMSkQvGIiIg+hPKe+6p8GsixY8eQkJCA0aNHl9i3evVqKCsrw9vbG3l5eejevTvWrVsn2a+iooKDBw/iyy+/hIuLC+rWrYsRI0Zg4cKFH/IlEBERUTVU3a9B3jedhIUKIiKq7qp8ZIUi4C9MRERUk/C8pzj4WRARUU1T3nOfQvSsICIiIiIiIiIqxmIFERERERERESkUFiuIiIiIiIiISKGwWEFERERERERECoXFCiIiIiIiIiJSKCxWEBEREREREZFCYbGCiIiIiIiIiBQKixVEREREREREpFBYrCAiIiIiIiIihcJiBREREREREREpFBYriIiIiIiIiEihsFhBRERERERERAqFxQoiIiIiIiIiUigsVhARERERERGRQmGxgoiIiIiIiIgUCosVRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIobBYQUREREREREQKhcUKIiIiIiIiIlIoqlWdABEREREplsLCQhw/fhwJCQkAADMzM7i5uUFFRaWKMyMiopqCxQoiIiIikjh58iQGDx4MExMTmJubAwAePnyIpKQk7Ny5E66urlWcIRER1QQsVhARERGRxMSJE7Fv3z60adNGavvFixcxevRoREdHV1FmRERUk7BnBRERERFJ5ObmlihUAEDbtm2Rl5dXBRkREVFNxGIFEREREUlYWVlh4cKFSElJkWxLSUnB999/D0tLyyrMjIiIahIWK4iIiIhIYtu2bYiPj4eVlRVq166N2rVrw8rKCvHx8di+fXtVp0dERDUEe1YQERERkUTDhg0RGBiIwMBApKWlAQB0dHSqOCsiIqppOLKCiIiIiEqlo6MjVahITU2twmyIiKgmYbGCiIiIiMrF0dGxqlMgIqIagtNAiIiIiEjiwIEDZe7Lzc39gJkQEVFNxmIFEREREUl8/vnncHNzgyAIJfZlZWVVQUZERFQTsVhBRERERBI2NjYICgqChYVFiX2mpqYfPiEiIqqR2LOCiIiIiCRGjBiB58+fl7pvwoQJHzgbIiKqqZSE0sb41TCZmZnQ0tJCRkYGNDU1qzodIiKiSsXznuLgZ0FERDVNec99HFlBRERERFJevnyJgoICAEBaWhrCw8Px+PHjKs6KiIhqEhYriIiIiEhi27Zt0NPTg6WlJSIiImBnZwd/f384ODhg9+7dVZ0eERHVEGywSUREREQSK1euRGxsLDIyMuDq6opjx46hTZs2uHfvHry9veHr61vVKRIRUQ3AYgURERERSaioqMDc3BwAoK2tjTZt2gAArK2toazMQblERPRh8IxDRERERBLKysq4efMmTp06hZcvX+L06dMAgNjYWBQWFlZxdkREVFNwZAURERERSSxatAiurq5QVlbGrl27MGfOHCQlJSE5ORmbNm2q6vSIiKiGYLGCiIiIiCR69uyJ1NRUyd9dunTB1atXYWpqCn19/SrMjIiIapIqnwaSmJiIoUOHQldXF7Vr14a9vT0uXbok2S8IAubNmwcjIyPUrl0bnp6euHv3rlSMtLQ0DBkyBJqamtDW1saYMWOQnZ39oV8KERERVSO8BimftLQ0ZGRkcAoIERF9UFVarHjx4gU6duwINTU1HD58GLdu3cJPP/2EBg0aSI5ZsWIF1qxZgw0bNuD8+fOoW7cuunfvjtzcXMkxQ4YMwc2bNxEWFoaDBw/ixIkT+OKLL6riJREREVE1wGuQsg0fPhwpKSkAgIiICDRv3hwzZ85Eq1atsH///qpNjoiIagwlQRCEqnrymTNn4vTp0zh58mSp+wVBgLGxMaZNm4Zvv/0WAJC
RkQEDAwNs2bIFAwcORExMDJo3b46LFy9KulWHhoaiZ8+eePz4MYyNjf8zj8zMTGhpaSEjIwOamprye4FEREQKiOc9XoO8T6tWrXDt2jUAgJubGwICAuDg4IAHDx6gX79+iIqKquIMiYioOivvua9KR1YcOHAAbdq0gY+PD/T19eHo6Ihff/1Vsv/BgwdITk6Gp6enZJuWlhbat2+Ps2fPAgDOnj0rtawWAHh6ekJZWRnnz5//cC+GiIiIqg1eg5QtJydHcv/Vq1dwcHAAAFhaWnIqCBERfTBVWqy4f/8+1q9fDxsbGxw5cgRffvklJk+ejK1btwIAkpOTAQAGBgZSjzMwMJDsS05OLtHsSVVVFTo6OpJj3pWXl4fMzEypGxEREdUcvAYpW/fu3TFlyhRkZ2fD09MTO3fuhCAIOHz4MPT09Ko6PSIiqiGqtFjx5s0btG7dGkuXLoWjoyO++OILjBs3Dhs2bKjU5122bBm0tLQkN1NT00p9PiIiIlIsvAYp208//QRlZWWYmJhg165dGDZsGNTV1REQEIDAwMCqTo+IiGqIKi1WGBkZoXnz5lLbbG1tkZCQAAAwNDQEADx9+lTqmKdPn0r2GRoaSppAFSsoKEBaWprkmHf5+/sjIyNDcnv06JFcXg8RERFVD7wGKZu6ujpWr16NJ0+e4MCBA7h8+TKSk5MRGhoKS0vLqk6PiIhqiCotVnTs2BG3b9+W2nbnzh2Ym5sDKJobaWhoiPDwcMn+zMxMnD9/Hi4uLgAAFxcXpKen4/Lly5JjIiIi8ObNG7Rv377U59XQ0ICmpqbUjYiIiGoOXoP8tzp16sDe3h6Ojo7Q1dUFADRp0qSKsyIioppCtSqffOrUqejQoQOWLl2KAQMG4MKFC9i0aRM2bdoEAFBSUsI333yDxYsXw8bGBpaWlpg7dy6MjY3Rt29fAEW/gnz66aeSoZuvX7/GpEmTMHDgwHJ14SYiIqKah9cgZbt+/XqZ+7Kysj5gJkREVJNV6dKlAHDw4EH4+/vj7t27sLS0hJ+fH8aNGyfZLwgC5s+fj02bNiE9PR2dOnXCunXrpCr7aWlpmDRpEv7++28oKyvD29sba9asQb169cqVgyIuG0ZERFRZeN4rwmuQ0ikrK8PCwgKlXSImJiYiPz+/CrIiIqKPRXnPfVVerFAEinihQEREVFl43lMcivhZWFpa4vTp06WODjE1NVXIPhtERFR9lPfcV6U9K4iIiIhIsfTu3Rv3798vdZ+Xl9cHzoaIiGqqKu1ZQURERESKJSAgoMx9lb20KxERUTGOrCAiIiIiIiIihcJiBREREREREREpFBYriIiIiIiIiEihsFhBRERERERERAqFxQoiIiIiIiIiUigsVhARERERERGRQmGxgoiIiIiIiIgUCosVRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIobBYQUREREREREQKhcUKIiIiIiIiIlIoLFYQERERERERkUJhsYKIiIiIiIiIFAqLFURERERERESkUFisICIiIiIiIiKFwmIFERERERERESkUFiuIiIiIiIiISKGwWEFERERERERECoXFCiIiIiIiIiJSKCxWEBEREREREZFCYbGCiIiIiIiIiBQKixVEREREREREpFBYrCAiIiIiIiIihSK6WJGeno7ffvsN/v7+SEtLAwBcuXIFiYmJckuOiIiIiIiIiGoeVTEPun79Ojw9PaGlpYWHDx9i3Lhx0NHRwZ9//omEhARs27ZN3nkSERERERERUQ0hamSFn58fRo4cibt376JWrVqS7T179sSJEyfklhwRERERERER1TyiihUXL17E+PHjS2w3MTFBcnKyzEkRERERERERUc0lqlihoaGBzMzMEtvv3LmDhg0bypwUEREREREREdVcoooVvXv3xsKFC/H69WsAgJKSEhISEjBjxgx4e3vLNUEiIiIiIiIiqllEFSt++uknZGdnQ19fHzk5OXBzc4O1tTXq16+PJUuWyDtHIiIiIvpIXbt2rapTICIiBSRqNRAtLS2EhYXh9OnTuHbtGrKzs9G6dWt4enrKOz8iIiIi+oh99tlnSEhIqOo0iIhIwVS4WPH69WvUrl0bV69eRceOHdGxY8fKyIuIiIiIPhJr1qwpdbsgCMjOzq6U5zx48CB69epVKbGJiKjyVbhYoaamBjMzMxQWFlZGPkRERET0kZk2bRqGDBkCJSWlEvuKe6DJ21dffcViBRFRNSZqGsjs2bMxa9YsbN++HTo6OvLOiYiIiIg+Ira2tvD390fTpk1L7Dt27JjouH5+fqVuFwQBGRkZouMSEVHVE1WsWLt2Le7duwdjY2OYm5ujbt26UvuvXLkil+SIiIiIqPqbOnUq8vPzS923ePFi0XHXrVuH7777DioqKiX2lTaKg4iIqg9RxYq+ffvKOQ0iIiIi+liNGjWqzH0jRowQHdfOzg4+Pj6wt7cvse+3334THZeIiKqeqGLF/Pnz5Z0HEREREdUQr169wu3bt2FtbY369euLjrNw4ULUrl271H1bt24VHfdtBQUFiI6ORuPGjaGlpSWXmERE9N+UZXnw5cuXsWPHDuzYsQNRUVHyyomIiIiIPiIzZsyQ3L927Rqsra0xaNAgWFlZ4cSJE6Lj9uzZE9bW1qXuc3d3FxUzIiICurq60NPTw/Hjx9GhQwcMHjwYVlZWOH78uOhciYioYkQVK1JSUuDu7o62bdti8uTJmDx5MpycnODh4YFnz57JO0ciIiIiqsbCwsIk9+fOnYt169YhNjYWBw4cgL+/v0yxc3JykJCQUGL7zZs3RcXz9/dHeHg4QkJC4O3tjeXLlyMmJgaHDh3CnDlzZMqViIjKT1Sx4uuvv0ZWVhZu3ryJtLQ0pKWl4caNG8jMzMTkyZPLHWfBggVQUlKSujVr1kyyPzc3FxMnToSuri7q1asHb29vPH36VCpGQkICvLy8UKdOHejr62P69OkoKCgQ87KIiIiohuA1SNVJSEiQ9D9zdnbGq1evRMc6evQojIyMYG9vj9atW+PevXuSfcOGDRMVMz8/Hw4ODujSpQu0tbUlIzTatWuH7Oxs0bkSEVHFiOpZERoaimPHjsHW1layrXnz5vjf//6HTz75pEKxWrRoIbVklarq/6U0depU/PPPP9izZw+0tLQwadIk9OvXD6dPnwYAFBYWwsvLC4aGhjhz5gyePHmC4cOHQ01NDUuXLhXz0oiIiKiG4DXIh5OSkoI1a9ZAEARkZWVJ7Xvz5o3ouHPmzMGJEydgb2+PoKAgeHp64uDBg7Czs4MgCKJivp2Pj4+P1L7CwkLRuRIRUcWIKla8efMGampqJbarqalV+ISjqqoKQ0PDEtszMjIQGBiI33//XVLR3rx5M2xtbXHu3Dk4Ozvj6NGjuHXrFo4dOwYDAwM4ODhg0aJFmDFjBhYsWAB1dXUxL4+IiIhqAF6DfDjdunWT9DdzdXXFkydPYGRkhMTEROjr64uO+/r1a7Rs2RIAMGbMGFhYWKBXr17466+/RC9d6uTkhMzMTG
hqamLZsmWS7XFxcdDU1BSdKxERVYyoaSDu7u6YMmUKkpKSJNsSExMxdepUeHh4VCjW3bt3YWxsjMaNG2PIkCGSOYeXL1/G69ev4enpKTm2WbNmMDMzw9mzZwEAZ8+ehb29PQwMDCTHdO/eHZmZmaLnKRIREVHNwGuQD2fz5s1SNyMjIwCAiYmJVD+LisrNzUVeXp7kbw8PD2zduhW9e/fGkydPRMUMCgoqtShhbm4uU65ERFQxoooVa9euRWZmJiwsLGBlZQUrKytYWloiMzMTv/zyS7njtG/fHlu2bEFoaCjWr1+PBw8eoHPnzsjKykJycjLU1dWhra0t9RgDAwMkJycDAJKTk6UuEor3F+8rS15eHjIzM6VuREREVHPwGuTDys/Pl5qWcejQIcybNw979+6VKW6/fv3w77//Sm1zc3PD9u3bZVoStTSqqqoy9dcgIqKKETUNxNTUFFeuXMGxY8cQGxsLALC1tZX6BaI8evToIbnfsmVLtG/fHubm5ggODi5zzWx5WLZsGb7//vtKi09ERESKjdcgH1b79u1x7Ngx6Orq4pdffsHGjRvRs2dP/PDDD7hx4wbmz58vKu6SJUtK3e7q6oo7d+7IknKpHB0dS115hIiI5E9UsQIAlJSU0K1bN3Tr1k1uyWhra6NJkya4d+8eunXrhvz8fKSnp0v9svH06VPJ/FJDQ0NcuHBBKkZxp+7S5qAW8/f3h5+fn+TvzMxMmJqayu11EBERUfXCa5DKVVhYCF1dXQDA9u3bcfz4cejq6iInJwft2rUTXawo9ubNGygrSw8YfvHiBRo0aFDhWAcOHChzX25uboXjFXvw4AEePXqEtm3bShXEwsLC5Ho9TUT0sRBVrJg8eTKsra1LLFO6du1a3Lt3Dz///LOoZLKzsxEXF4dhw4bByckJampqCA8Ph7e3NwDg9u3bSEhIgIuLCwDAxcUFS5YsQUpKiqQ5U1hYGDQ1NdG8efMyn0dDQwMaGhqiciQiIqKPD69BKldBQQGys7NRr149qKurQ0dHBwBQu3ZtmVYDuXTpEnx8fJCUlISePXti06ZNaNiwIYCi/hVXrlypcMzPP/8cbm5upa4m8u5KJuW1c+dOfPPNNzA0NER6ejqCg4Ml/5ZmzJghU7EiJiYGf/zxh2TEh5mZGXx9fdGiRQvRMYmIFIGonhV79+5Fx44dS2zv0KEDQkJCyh3n22+/xfHjx/Hw4UOcOXMGn3/+OVRUVDBo0CBoaWlhzJgx8PPzQ2RkJC5fvoxRo0bBxcUFzs7OAIBPPvkEzZs3x7Bhw3Dt2jUcOXIEc+bMwcSJE2vkhQARERGVD69BPqxJkyahR48eOHbsGLy8vDBhwgT8+++/mDFjBtq2bSs67tSpU7F27VokJSXBzs4Orq6uSExMBADRS5fa2NggKCgIkZGRJW56enqiYv7444+IiopCdHQ0Nm/eDF9fX0RERMiUJwD873//Q48ePZCXl4f27dujffv2yMvLg5eXF9auXSs6LhGRQhBE0NDQEO7evVti+927dwUNDY1yx/H19RWMjIwEdXV1wcTERPD19RXu3bsn2Z+TkyN89dVXQoMGDYQ6deoIn3/+ufDkyROpGA8fPhR69Ogh1K5dW9DT0xOmTZsmvH79ukKvJyMjQwAgZGRkVOhxRERE1RHPe7wGqQr79+8XOnXqJOjo6AiampqCvb29sGzZMiE3N1d0TAcHB6m/t2/fLjRp0kRISEgQHB0dRcVcunSpcPHixVL3LV68WFTMli1bSv19/fp1wcLCQjh69KjoPAVBEGxsbIS0tLQS21NTUwVra2vRcYmIKlN5z31KglDxcq6dnR0mTJiASZMmSW3/5ZdfsH79ety6dUsuhZQPJTMzE1paWsjIyOD62URE9NHjeU9x8LOQTdOmTRETEyPVr2L37t2YO3cu8vLyEB8fX4XZ/R97e3ucOXNGaoWSW7duwcvLC5mZmUhNTRUV19raGvfu3Sux/c2bN7CxsUFcXJyouDdu3ICdnZ2oxxIR/ZfynvtE9azw8/PDpEmT8OzZM7i7uwMAwsPDsXLlSgQEBIjLmIiIiIg+avJshAkAHTt2xKFDh9CrVy/JNl9fXygpKWHo0KEy5VqsoKAA0dHRaNy4MbS0tETF+OKLL3Dp0iV07dpVsq158+Y4dOgQpkyZIjq3Hj16oFu3bhg3bhzMzc0BAPHx8fj111/Rs2dP0XFbtmwJe3t7jBkzBkOHDpX0GCEi+pBEjawAgPXr12PJkiVISkoCAFhaWmL+/PkYPny4XBP8EPirBhER1SQ87ymOmvJZvK8RZuvWrUU1wqwsERER8PHxgZKSEvbu3Yvp06cjKysLz549w969e+Hm5lbVKUoIgoDt27cjODhYqsGmj48Phg0bVqIwVF729vZYsGABAgMDceLECfTq1Qtjx46Fp6enPNMnohqqvOc+Uf8Fy8nJwYgRI/D48WM8ffoU169fx6RJk2BgYCA6YSIiIiL6OFVGI0wASE9Pl1OG/8ff3x/h4eEICQmBt7c3li9fjpiYGBw6dAhz5syR2/OMGjVK5hhKSkoYPnw4Dh48iOvXr+P69es4ePAgRowYIbpQAQBqamrw9vbGoUOHEBMTAzs7O4wfPx4WFhZYuHChzHkTEZWHqGkgffr0Qb9+/TBhwgSoqanB09MTampqeP78OVatWoUvv/xS3nkSERERUTWVnZ0NLy8vAMCiRYvQtGlTuLu749ixY1BSUhIdV19fHz169MDYsWPh5eUl0xf0Yvn5+XBwcAAAaGtrS6Y8t2vXDtnZ2aJi+vn5ldi2b98+yfSXVatWiUsWRT8iPnv2DGZmZlLbb968KZflS01NTTFnzhzMmTMH4eHhCAoKkjkmEVF5iPov+pUrV9C5c2cAQEhICAwMDBAfH49t27ZhzZo1ck2QiIiIiKq3V69e4c2bN5K/hw4dioULF8LDw0N0c0mgaBqyq6srZsyYgUaNGmHmzJm4c+eOTLm+naePj4/UvsLCQlExN27ciGfPnkFLS0tyU1JSktwX6+jRozAyMoK9vT1at24t1Wxz2LBhouOqq6uXut3DwwM7d+4UHZeIqCJEFStevXol6WZ89OhR9OvXD8rKynB2dlaYrstEREREpBiKG2G+zdfXF4sXL8aTJ09Ex61bty6mTZuGW7duISQkBM+fP0ebNm3g6uqKbdu2iYrp5OSEzMxMAMCyZcsk2+Pi4kT3Fbl06RLi4+Ohq6uL+fPnY/78+dDW1pbcF2vOnDk4ceIE0tPTMXHiRHh6euLGjRsAZJtec+7cOdGPrQh5TIUhoo+XqGkg1tbW2L9/Pz7//HMcOXIEU6dOBQCkpKR81M2hiIiIiKjiSps6sHHjRowfPx4DBgyQy3N06NABHTp0QEBAAHbt2oVNmzaJavxe1jQHc3NzhIWFicrN1tYWERERWLhwIT755BP8+uuvMk1/Kfb69Wu0bNkSADBmzBhYWFigV69e+Ouvv2SOX1hYiOPHj0s17nRzc4OKioqoeJU5FeZt0dHRuHjxIlq2bIk2bdrIJSYRVQ1RxYp58
+Zh8ODBmDp1Kjw8PODi4gKgaJSFo6OjXBMkIiIiourtwIEDJbbNnz8fRkZGAIDevXuLilva6IG6detizJgxGDNmjKiYxa5evYqHDx9CVVUVzZs3R+PGjaGqKurSGQCgqqqKhQsX4vz58/D29sbLly9lyg8AcnNzkZeXBw0NDQBF0zS2bt2K3r17Iz8/X3TckydPYvDgwTAxMZEsifrw4UMkJSVh586dcHV1rXDMjRs3ol+/frC2tpZsK54KIwsPDw/88ccf0NfXR3BwMKZOnYqOHTti4cKF8Pf3x/jx42WKT0RVR/TSpcnJyXjy5AlatWolaWZ04cIFaGpqolmzZnJNsrLVlGXDiIiIAJ73FElN+SyUlZXh4uIi1Qvh3LlzcHZ2hpKSEiIiIkTFTUtLg46OjrzSBABcv34dQ4YMQUJCArKzs9G8eXMkJibCw8MDgYGBcvmccnJyEBcXBzs7O5nizJ49G66urujevbvU9hMnTmDs2LGi+3e0bNkSQUFBJUYmXLx4EaNHj0Z0dHSFY8bExEhG0kyaNAlAUc+RBw8eiMrx7VyvX78OAHBxccGuXbtgbm6OtLQ0dOnSRbKPiBRHpS5dCgCGhoZwdHSU6rrcrl27aleoICIiIqLKFRgYCKBoqH9kZCQiIyNhaGiIyMhI0YUKAO8tVDRp0kRUzAkTJmD9+vXIyMjAvn370LVrVyQnJ6Np06b4+uuvxaaKvLw87N+/Hz///DMCAwPx7Nkz0bGKLVmypEShAgBcXV1lajSam5tb6hSKtm3bIi8vT1TM4qkwKSkp+OSTTxAfHy+XqTB5eXmSxqeCIEhGgujo6MjUt+Ndc+fOlVssIiof8WPZiIiIiIjKYdSoUXB3d8fYsWPRuXNnzJ49Wy5fVN/3q3lWVpaomK9evUKnTp0AFE1PWbRoEdTV1bF48WLRBZDIyEiMHDkS2trauH37Njp37ox169ahXr162LdvH0xMTETFLfbmzZsSy7a+ePFC0g+ioqysrLBw4UJMmDAB+vr6AIp6061fvx6Wlpai86yMqTCDBg2Cr68vli9fjv79+2PJkiUYMmQIDh8+jMaNG4uKWdrqhuvXr0fDhg0BAJMnT5YpZyIqHxYriIiIiKjSmZub4+jRo1i1ahU6d+4s+hf6tzk4OMDCwqLUX9DFLomqpqaG2NhYNGvWDOfOnUPdunUl+8Q2l5w2bRqOHTsGGxsbXLx4Eb/88gvCwsLw66+/YuLEidi/f7+ouJcuXYKPjw+SkpLQs2dPbNq0SfKF2sPDA1euXBEVd9u2bZg5cyasrKxQUFAAoKjQ4OPjg+3bt4uK+bb27dvj5MmTiIuLkznWggULEBAQgK5du+Lp06coKCjAihUrMGjQIGzevFlUTD8/P3h5eUmN3MnLy0NUVJRcimxEVD6ie1Z8TGrKfFEiIiKA5z1FUlM/i5s3b+LkyZOYMGGCTHEsLS1x+vRpGBsbl9hnamqKR48eVThmaGgohg4dCj09PaSmpiIkJARubm5ITk7GvHnzsGnTpgrHdHBwwNWrVyV/Ozk54fLlywCApk2b4vbt2xWOCQCdO3fGzJkz4ezsjJ9//hkhISE4duwYTExM4OjoiKioKFFx35aWlgbg/VNuZHXt2jW0atVK5jhZWVkoKCgQPaKkWEREBPz9/TF37lz06tULgHz6a7yPvN4Douqg0ntWEBERERGJ0aJFC5kLFUDRNI379++Xus/Ly0tUzE8//RT37t3Dzp07ERcXBzc3NwBF/drEFCoAoF69eoiMjAQAhISESKZWyCo7OxteXl7Q1dXFokWLMHv2bLi7u+PRo0dyGwGgo6MDTU1NREVFISMjQy4x3/XZZ5/JJU79+vVRv359mXN1d3dHWFgYgoODMWrUKGRmZlb6iAp5vQdEHxNOAyEiIiKiaikgIKDMfRs2bBAdV1tbG05OTiW2N2nSRFTjytWrV6Nfv354/vw5jIyM8NdffwEoWl1vyJAhovN89eqVVL+KoUOHQk1NDR4eHjJNs4mIiICPjw+UlJSwd+9eTJ8+HVlZWXj27Bn27t0rKeBURGl9IICippjZ2dkKlSsAaGpqYtu2bZKRNTk5OaJzLFZZ7wHRx4rTQFBzh2ASEVHNxPOe4uBnoXje17Sze/fuePLkiejYqamp0NXVFf34d40ePRr9+vWTTFUoFhwcjKFDhyI/P19U3Pbt22Pjxo1IT09H//79ERwcDHd3d1y4cAHTpk3DyZMnKxxTTU0NQ4YMKXWEQkhIiOiGqJWR67uePn2Ky5cvo2fPnjLFqaz3oLCwEMePH0dCQgIAwMzMDG5ubqJ7rBBVtvKe+ziygoiIiIjo/6uMpp3F3i5UyLJaR7GgoKBStw8YMAADBgwQHTc/Px8ODg4AikaZuLu7AwDatWsnegSAra0t/P390bRp0xL7jh07plC5AqUXAAoLC2UqAFTGe3Dy5EkMHjwYJiYmkmVbHz58iKSkJOzcuROurq6i85W3Fy9eYN++fVLvad++fSu1HwpVb+xZQURERET0/5mbm+PUqVN48OBBiZuBgYGomG9PV3nw4AFatGgBY2NjWFpaIjo6WnSu6enpoh/7Pm/evJHc9/HxkdpXWFgoKubUqVPLHOmxePFiUTGBysn15MmTsLCwwKxZs3D48GEcPnwY/v7+sLCwwIkTJ0TnWhnvwcSJE7Fv3z6cO3cOu3fvxu7du3H+/Hn8+eefmDhxouhc3/Xq1StERUWJHv2xd+9eNGvWDEePHkVOTg5ycnJw5MgRNG/eHHv37pVbnvSREUjIyMgQAAgZGRlVnQoREVGl43lPcfCzUDyTJ08WTp48Weq+8ePHi4rp6OgouT9w4EBh7dq1giAIQkhIiODp6SkqpiAIgpqamtC7d2/hwIEDQmFhoeg47xo1alSp/ybv3bsndOzYUW7PIw+Vkau9vb1w8eLFEtsvXLgg2NnZiYpZWWxsbETt+y/fffed5P7Vq1cFIyMjoWnTpkLDhg2F48ePVzhe06ZNhQcPHpTYfv/+faFp06ai8xQEQcjNzRXWr18v/P3334IgCEJgYKAwdOhQYfny5UJeXp6omNHR0TLlRO9X3nMfe1aA80WJiKhm4XlPcfCzqBlat26NK1euAABatWqFa9euSfa9u6xpRTRt2hRffPEFAgMDkZ6ejuHDh2P06NFo0qSJPNIuoaCgAK9fv0bt2rVFx3i7IWgxeUyJeZcsub6vkarYJqtlmTt3LhYtWiT68T169ICLiwsmTJggWWUmJSUF69evx5kzZ3DkyBFRcd/+N9u7d2+MHj0affv2xblz5zBt2jScPn26QvFsbGxw9+7dUvdZW1vj3r17ovIEgFGjRiE5ORmvXr1C8+bNcfv2bfTv3x9HjhyBvr4+fv311wrHVFZWhr29PcaMGYOhQ4dyqoqcsWcFEREREZECSE9Px99//w1BEPD69WupfbL8bli3bl1MmzYN06ZNw5kzZxAUFIQ2bdrAwcEBY8eOxfDhw2XK++rVq3j48CFUVVXRvHlzNG7cGKqq4r4+XLp0CT4+PkhKSkLPnj2xadMm
NGzYEADg4eEh+WIsq+joaFy8eBEtW7ZEmzZtRMWwsrLCwoULSy0AWFpais6ttNVA1q9fL3kfJk+eXOGY27Ztw8yZM2FlZYWCggIAgKqqKnx8fLB9+3bRub4tISEBffv2BQA4Ozvj1atXFY7Rtm1bjB49GhMmTJD01oiPj8eGDRtEf07FLly4gBs3biA3NxeGhoZISkpC3bp1MW7cODg6OoqK2aJFC8ybNw+BgYGYNWsWevXqhbFjx8LT01OmXKli2LOCiIiIiKgSmZmZYdWqVVi9ejUMDAyQmJgIoOgLsLq6ulyeo0OHDvjtt9/w5MkTjBgxAps2bRId6/r167C3t4ebmxu8vb3h7++PNm3awMfHB5mZmaJiTp06FWvXrkVSUhLs7Ozg6uoqeR9kKdh4eHggJSUFQNEqKJ9++ilCQ0PRv39/bNy4UVTMbdu2IT4+HlZWVqhduzZq164NKysrxMfHy1QA8PPzQ3h4OKKioiS3vLw8REVFiR5d07BhQwQGBiIrKwuJiYlITExEVlYWgoKCJIUWMVJSUrBmzRoEBASU6FPxdp+Q8goMDISlpSVGjx4NGxsb2NjYYMyYMTA3Ny+zUWx5qaqqQklJCbVq1UKtWrVQt25dAEWrr4htiKqmpgZvb28cOnQIMTExsLOzw/jx42FhYYGFCxeKzjU0NFQysio8PBxz587F7t27Rcf72HEaCDgEk4iIahae9xQHP4uarbCwEHl5eahTp46oxzs6OiIqKkrOWRUVPlasWIFOnTrhwIEDOHbsGFauXImFCxfi0aNH2Lp1q8y57tixA4sWLcKxY8fQp08f0SMrWrZsKVlu1sXFBbt27YK5uTnS0tLQpUuX9y5FWx5paWkAIJdpABEREfD398fcuXMly81aWlriwYMHMsV9d+USc3NzuLq6yrRyyahRo6T+Xrp0KYyMjJCYmIiRI0ciLCxMppzlqXfv3mjSpAmysrJw9+5dtGjRAkOGDEFoaCjOnTuH0NDQCsd8exrM28LDwxEUFISdO3dWOOb06dNx5MgRFBQUYPjw4dixYwd69uyJyMhIdOvWDUuXLq1wzOqqvOc+FivACwUiIqpZeN5THPwsaob79+9j7NixePjwIfr27YulS5eiVq1aAIq+YJ89e1ZU3LS0tEqZS/9uH422bdvi4sWLAMT3bGjatCliYmKk+lXs3r0bc+fORV5eHuLj40Xl2rRpU9y6dQsqKipwdnbGuXPnJPvs7e1Fr7by8uVLaGhoQFVVFWlpaYiKikLTpk3RqFEjUfGKZWZmYtKkSVBRUUFAQAAcHBxw//590fGKly41NjaGhYUFAMVduvRtr169wu3bt2FtbY369evLFOv58+dYsmQJlJSUMHfuXPzxxx9Yv349rKys8PPPP0vel4p499+SPDRv3hxXr17Fy5cv0ahRI8THx0NPTw8vX75Eu3btcPPmTbk+nyIr77mP00CIiIiIiCrRl19+CW9vb+zZswfPnz+Hh4eHZGh9bm6u6LjvK1TI0mRTTU0NsbGxAIBz585JhtUDEP1rfceOHXHo0CGpbb6+vli8eDGePHkiOtdBgwbB19cX9+7dQ//+/bFkyRI8fPgQ69evR+PGjUXF3LZtG/T09GBpaYmIiAjY2dnB398fDg4OMg/Z19TUxLZt29CrVy+4ubkhJydHpnjFS5eeP39e7kuXxsbGSqbYxMbGIjAwEOfPnxcVa8aMGZL7165dg7W1NQYNGgQrKyuZloMFAD09PaxevRqrVq1CgwYN8NVXXyE6Ohr79+8XVagAIPdCBQBoaGhAXV0dDRo0gLa2NvT09AAU9Z6R13Swjw0bbBIRERERVaKUlBTJF8dt27Zh6dKl8PDwQFhYGJSUlETHfd8Uh3f7DFTEokWL0KlTJ+jp6SE1NRUhISEAgOTkZHTu3FlUzLL6EgwYMAADBgwQneuCBQsQEBCArl274unTpygoKMCKFSswaNAgbN68WVTMlStXIjY2FhkZGXB1dcWxY8fQpk0b3Lt3D97e3vD19RWdbzFvb2906tQJly9flilObm5uqQ0q27Zti7y8PNFxf/zxR6xcuRIaGhpYunQpZs2aBWdnZyxcuBB+fn6YMmVKheKFhYXhhx9+AFC0Asq6detkWl3kXU+ePMHu3bslDWHt7OwwaNAgaGhoyBS3WEFBAaKjo9G4cWNoaWmJitGgQQOsXbsWGRkZ0NPTww8//IARI0YgNDRUqiBI/4fFCiIiIiKiSvTur+ezZs2Curq61AgLMRwcHGBhYVFqg8rU1FTRcT/99FPcu3cPcXFxsLGxkQzTNjQ0lKlxZ2WZMmUKpkyZgqysLBQUFMi8DKqKiopkxQptbW1JMcDa2rrEsquy0NXVhZGRETIyMkR/Aa6slUu2bNmC2NhYZGdno1mzZrhx4wYsLS3x/PlzdOnSpcLFirfJY3WRtwUHB+O7775Dq1atcObMGXh6euLmzZtYsGABDh8+DFtb2wrHjIiIgI+PD5SUlLB3715Mnz4dWVlZePbsGfbu3Qs3N7cKx9y0aROmT58OZWVl/P3331i3bh1sbGxgZWUlqg9MTcBpIERERERElcjW1rZEk79vv/0WgwcPRlxcnOi45ubmOHXqFB48eFDiZmBgIFPO2tracHJyKjGfXOz0kri4OHTt2hWNGzeGn5+f1PQXFxcXmXItVr9+fZkLFQCgrKyMmzdv4tSpU3j58qXkV//Y2FgUFhaKjhsREQFdXV3o6enh+PHj6NChAwYPHgwrKyscP35cVMzKWrlEQ0MDDRo0gKmpqWRKDFA05UJNTa3C8eS9usjbFi1ahEuXLuGvv/7C+fPnkZubi8OHD2Pjxo2YNGmSqJj+/v4IDw9HSEgIvL29sXz5csTExODQoUOYM2eOqJjW1tbYt28f9u7dCzMzMyxfvhxZWVm4evUqWrVqJSomANy4cUP0YxUdR1YQEREREVWiXbt2lbrdz89PpikFvXv3xv3792FsbFxin5eXl+i4lTG95KuvvkL//v3h7OyMgIAAeHh4IDQ0FPXr15epb0dcXBzGjh2L+Ph4uTUvXbRoEVxdXaGsrIxdu3Zhzpw5ePLkCZ48eSLTyJLiL8Dp6enw9vZGcHAw3N3dceHCBUybNg0nT56scMzipUsDAwPlunKJhoYG/vnnH7x48QJKSkrYvXs3fH19ERkZKapvSbdu3SSrwbi6uuLJkyeS1UVkWWIVKBoJU9z/oXHjxpJmrd27d8c333wjKmZ+fj4cHBwAFBXu3N3dAQDt2rVDdna2TPkWk1eT0ZYtW8Le3h5jxozB0KFDK6XpblVhsYKIiIiIqBK9b968iYmJ6LgBAQFl7tuwYYPouJUxvaSy+nZURhGkZ8+eUq+zS5cuuHr1KkxNTWX6Yl3ZX4CLv6S+ePFC5hEma9aswRdffAFlZWX89ddfWL58OUaMGIF69eohODi4wvHK6h9iYmIi8zKo+vr62Lx5M3r06IEdO3ZIGqsKgoCCggJRMd8e7eHj4yO1T+zomhkzZkj6dly7dg09evSApqYm0tLSEBISInrllhYtWmD
evHkIDAzErFmz0KtXL4wdOxaenp6i4ikSTgMhIiIiIiKJypheUlrfjgEDBsjct6O4COLk5IRt27bBy8sLHh4eyMjIkKkI8jYVFRU0btxY5hEAlfEF+O2C1YMHD9CiRQsYGxvD0tJS9LKtQFGDzqioKFy+fBmtWrXCH3/8gcTERKSkpEiKLBWVl5eH/fv34+eff8batWsRGRkpOr+3rVu3DoGBgbCxscE///yDVatWAQCePXuGmTNniorp5OSEzMxMAMCyZcsk2+Pi4kQvM/12Uaa4yWhsbCwOHDgAf39/UTGBotV7vL29cejQIcTExMDOzg7jx4+HhYUFFi5cKDquImCxgoiIiIiIJIqnl5RG7PSSyurbURlFkKtXr8LBwQGtW7fGzZs34eXlBRMTE5iZmb13isx/qYwvwG83Zpw1axa++uor5OTkYOXKlfDz8xOda3p6eolturq6ohuMRkZGokmTJpg/fz5mzpyJv/76CxMnTkS7du2QmJgoOk+gqBfEqVOnkJWVhcjISJiZmQEoGnExZswYUTGDgoJK/UzMzc1lHgkCyL/JaDFTU1PMmTMHcXFxCAwMxO3bt+US923Xrl2Te8yysFhBREREREQSAQEB6NSpU6n7xE4v2bVrF7p27Vpiu5+fHx49eiQqJlA5RZApU6ZgwYIFmDx5Mnr27ImBAwfi1atXWLNmDb799lvRuVb2F+Bbt25Jptp4e3vj2bNnomPp6+ujT58+OHDggMwNMAFg2rRpOHbsGK5du4aTJ0/CyMgIt27dwrhx4yQ5y9PcuXPlHnPjxo1QVVVF7dq1RT2+spqMqqurl7rdw8MDO3fuFB23LJ999pncY5aFPSuIiIiIiKhSHThwQDL14fnz5xgxYgROnToFR0dHbNu2TXTcymhempmZKfnVe968eRg2bBgAoG/fvliwYIGomEDRaAVtbe0S21VVVaGqKu5rWXp6Ov7++28IgoDXr19L7Sut50h5WVpawtXVFTNnzsSECRMwfPhwjB49WvRqMG/evIGNjQ2AoikmN2/eBACMGzcOK1euFJ0nUNRf413r169Hw4YNAQCTJ0+ucMwDBw6U2DZ//nwYGRkBKBp9VFGV1WT03Llzoh9bltLeU6Do35S8GoyWB4sVRERERERUqZYtWyYpVvj7+8Pe3h6BgYH4/fffMWXKFOzbt09U3Pc1L+3atSvu3LlT4Zhvf8l/dzSILAUAfX199OjRA2PHjoWXl5foKRVvMzMzk/RoMDAwQGJiIkxMTJCSklLmL+7lUbduXUybNg3Tpk3DmTNnEBQUhDZt2sDBwQFjx47F8OHDKxSvXr16iIyMRNeuXRESEiJz/4+3+fn5wcvLS2oVjLy8PERFRYnuW9K3b1+4uLhIvYcZGRlYvXo1lJSURBUrKqvJaFlFMFlMmzYNQ4YMKfX9e7coVplYrCAiIiIiokr19pf8Cxcu4MqVK1BRUYGfn59U34WKqoxlVg0MDJCZmQlNTU2p3J48eSJZFlWM4tEKM2bMwPjx42UerQAA//77b6nbdXV1cfz4cdFx39ahQwd06NABAQEB2LVrFzZt2lThYsXq1avRr18/PH/+HEZGRvjrr78AAMnJyRgyZIhM+R09ehT+/v4YN24cevXqBaDofSmrOFAegYGB+O2337Bq1So4OjoCKPr8ZGkKeuPGDdjZ2Yl+fFkaNmyInj17yrUIZmtrC39/fzRt2rTEvmPHjskcv7yUBFnKgx+JzMxMaGlpISMjQ3RzGyIiouqC5z3Fwc+CagpbW1sEBwdDEAQMHz4cV69elexzcHCQ+rsilJWVy1xmNTExEfn5+SIzLikzMxPp6emSBo4V1bp1a1y5cgUAJKMVgoODRY9WqEyOjo6SKQvylJqaCl1dXbnHzczMxKRJk6CiooKAgAA4ODiU2SS2vOLj4zF27Fh07twZs2fPho2NjUwxlZWVYW9vjzFjxmDo0KFSI0Fk0bRpU3zxxRcIDAxEenq6XIpgmzdvRps2bWBvb19i39atWzFixAhZUi73uY8NNomIiIiIqFLl5OSgT58+6NOnDzIyMvD48WMARUPrZfkluDKWWd2zZ4/k/vPnz+Hl5QUtLS1RQ//L0qFDB/z222948uQJRowYgU2bNomKExcXh65du6Jx48aYOnUqcnNzJftcXFxE5xceHi76se9TWqEiNTVV5riamprYtm0bevXqBTc3txKrxIhhbm6Oo0ePom7duujcuTPy8vJkiteiRQvMmzcPoaGhMDMzw8CBA+UySqF4ys6tW7cQEhKC58+fo02bNnB1dRXdD2bUqFGlFioAyFyoqAgWK4iIiIiIqFI9fPgQ9+/flxQSGjVqBABQU1PD3r17RcetjGVW315WtLi/xu3bt9G7d29MmTJFVEyg9H4XdevWxZgxY3Dq1ClRMb/66iv0798fe/bsQWpqqtSSrW8XLiqqtF/9X7x4ITre+xRPs5AHb29vhIaGIjAwUC7xlJSUMG3aNPz6668yrzCipqYGb29vHDp0CDExMbCzs8P48eNhYWGBhQsXyiVfeRXBgKK+H/v378fPP/+MtWvXyjQFRixOAwGHYBIRUc3C857i4GdBpHjengLRqlUrSX+N4r+vXbsmKm5aWprchv4Xe3e6xtKlS7F//36EhYWha9eukmknFXX16lWMHDkSysrK2L59O7777jtERkZCT08PBw8eRMuWLSsUr7TVNYqNHTsWKSkpovKsTt6eBvS28PBwBAUFiV5mtDKm7ERGRmLkyJHQ1tbG7du30blzZyQmJqJevXrYt28fTExMZIpf3nMfG2wSERERERH9f7m5uYiOjoYgCFBSUpIUKgCIXl0CKH20QrEmTZqIWrnk3ekOs2bNgrq6utQICzGmTJmCBQsWID09HT179sTixYvxzz//YP/+/fj2229x9OjRCsX7/PPP4ebmVuroElnyBIqmwowdOxbx8fHo27cvli5dKmmE6uLigrNnz8oU/11iP6uyVmfx8PCAh4eH6HwqY8rOtGnTcOzYMdjY2ODixYv45ZdfEBYWhl9//RUTJ07E/v375f6cpWGxgoiIiIiI6P8r7q9R/MX68ePHaNSokcz9NSpj5RJbW1uEhobi008/lWz79ttvoaysjG+//VZUTKDol+++ffsCAObNm4dhw4YBKFrSc8GCBRWOZ2Njg6CgIFhYWJTYZ2pqKjpP4P+mwjg7OyMgIAAeHh4IDQ1F/fr1RU+FqYzP6ty5c6Ie91/kPVoHAN68eQMbGxsAQNu2bXHz5k0AwLhx47By5Uq5P19ZWKwgIiIiIiL6/x4+fFjqdln7azg4OJS5conYJpO7du0qdbufnx98fX1FxQSk+2t07dq1zH3lNWLECDx//rzUYsWECRMqHO9tKSkpmDhxIgBg27ZtWLp0KTw8PBAWFiZ6JExlfFYAUFhYiOPHjyMhIQEAYGZmBjc3N6nRO/IkdhRIvXr1EBkZia5duyIkJAT6+vqVkN1/U5gGm8uXL4eSkhK++eYbybbc3FxMnDgRurq6qFevHry9vfH06VOpxyUkJMDLywt16t
SBvr4+pk+fjoKCgg+cPREREVVnvA4hov9Sp04dWFpain58ZaxcoqGhAQ0NjVL3vVtkqAgDAwNkZmYCKFqqstiTJ08kUywqwt/fH23atJHatnHjRgDA7NmzRecJlD4VZsCAATJNhamMz+rkyZOwsLDArFmzcPjwYRw+fBj+/v6wsLDAiRMnRMUEikaBlHUT+/pXr16N4cOHo3bt2vjuu++wYsUKAEBycjKGDBkiOteKUoiRFRcvXsTGjRtLNGqZOnUq/vnnH+zZswdaWlqYNGkS+vXrh9OnTwMoqkx5eXnB0NAQZ86cwZMnTzB8+HCoqalh6dKlVfFSiIiIqJrhdQgRfQjFK5cYGxuX2Cd25ZLKmK4AAEeOHCl1e506dRASElLheKU12Jw/fz6MjIwAQKZlYStjKkxlfFYTJ07Evn37ShRtLl68iNGjRyM6OlpU3MoYBdK2bVs8evQIqampUsvNGhoaYt68eaJiilHlq4FkZ2ejdevWWLduHRYvXgwHBwf8/PPPyMjIQMOGDfH777+jf//+AIDY2FjY2tri7NmzcHZ2xuHDh9GrVy8kJSVJKlwbNmzAjBkz8OzZszKbmLyLnbiJiKgm4Xnv/1T1dQg/CyKShbKycplfVBMTE5Gfny/35xQztUBZWRkuLi5S/108d+4cnJ2doaSkhIiICNH55OXlAUCpI0wSExNlXrlCXt73vomdrgEAlpaWOH36dKmFFVNTUzx69EhU3He9ePECDRo0kEus8p77qnwayMSJE+Hl5QVPT0+p7ZcvX8br16+ltjdr1gxmZmaSjq5nz56Fvb291FCc7t27IzMzU9IEpDR5eXnIzMyUuhEREVHN86GvQ3gNQkTyVBnTFQD5Ty0IDAwEAKxatQqRkZGIjIyEoaEhIiMjZSpUAJUzFWbPnj2S+8+fP4eXlxe0tLTQpUsXSb+JirKyssLChQullmlNSUnB999/L9P0ouJRIKUROwokICBAcv/Bgwdo0aIFjI2NYWlpKXoEiBhVOg1k165duHLlCi5evFhiX3JyMtTV1aGtrS213cDAAMnJyZJj3v0/YfHfxceUZtmyZfj+++9lzJ6IiIiqs6q4DuE1CBHJU2VMVwDkP7Vg1KhRcHd3x9ixY9G5c2fMnj1bpmVg31YZU2GWLVsGHx8fAEX9Nuzt7REYGIjff/8dU6ZMwb59+yocc9u2bZg5cyasrKwkvY1UVVXh4+OD7du3i8oTkC4svGvDhg2iYm7duhVTpkwBUNQD5KuvvsLEiROxd+9e+Pn5ISwsTFTciqqyYsWjR48wZcoUhIWFiWrSIgt/f3/4+flJ/s7MzJR5yRwiIiKqPqrqOoTXIEQkT5XxRRX4vxEbZU0tEBvz6NGjWLVqFTp37iyZviGryujZ8HasCxcu4MqVK1BRUYGfn59Uw9GKaNiwIQIDAxEYGIi0tDQAQHBwsMyrobzr2bNniI6Ohq2traQniCxu3bqFP/74AwDg7e2NRYsWyRyzvKqsWHH58mWkpKSgdevWkm2FhYU4ceIE1q5diyNHjiA/Px/p6elSv2o8ffoUhoaGAIoafFy4cEEqbnGX7uJjSvO+oUJERET08auq6xBegxBRdVBZIzaUlJQwbdo0fPrppzh58qQsKUpURmElNzcX0dHREAQBSkpKUkuLih0RUlqT0QULFsDExASCIIhuMjp8+HCsXLkS+vr6iIiIgK+vLywtLfHw4UNs2rQJffv2rXDM9PR0/P333xAEAa9fv5ba9yFbXlZZscLDw6PEfJdRo0ahWbNmmDFjBkxNTaGmpobw8HB4e3sDAG7fvo2EhAS4uLgAAFxcXLBkyRKkpKRI1n4NCwuDpqYmmjdv/mFfEBEREVUbvA4hIipbZY3YKNaiRQu0aNFC5jhA5RRWcnJy0KdPH8kX88ePH6NRo0bIyMiAsrK4to99+/Yt0WQ0IyMDq1atgpKSkuhixbVr1yTnoO+//x5hYWFwcHDAgwcP0K9fP1HFCjMzM6xatQpA0fTG4kalKSkp5V7EQh6qrFhRv3592NnZSW2rW7cudHV1JdvHjBkDPz8/6OjoQFNTE19//TVcXFzg7OwMAPjkk0/QvHlzDBs2DCtWrEBycjLmzJmDiRMn8lcLIiIiKhOvQ4iIPg6VUVh5+PBhqdvV1NSwd+9eUTEDAwPx22+/YdWqVXB0dARQtJJHZGSkqHjFcnJyJPdfvXoFBwcHSezCwkJRMf/9999St+vq6uL48eOiYopR5auBvM/q1avRq1cveHt7w9XVFYaGhvjzzz8l+1VUVHDw4EGoqKjAxcUFQ4cOxfDhw7Fw4cIqzJqIiIg+BrwOISKit9WpU0f0yh2jRo3C77//ju+++w4LFy5EYWGhXJqMdu/eHVOmTEF2djY8PT2xc+dOCIKAw4cPQ09PT+b4b1NRUUGdOnXkGvN9lIQPOelEQXGNcyIiqkl43lMc/CyIiGoWQRCwatUq7N27F/Hx8UhMTJQpXn5+PmbMmIGgoCDo6OggPj4eKioq8PDwwPr160UVV+Li4jB27FjEx8ejT58+WLZsmaQZtYuLi2QJb7HKe+5jsQK8UCAiopqF5z3Fwc+CiKhmunnzJk6ePCm31UBevXqFuLg4FBQUwNzcHDo6OqJjde/eHb1794azszMCAgIQFxeH0NBQ1K9fH46OjoiKipIp1/Ke+xR6GggRERERERHRx6ZFixZyKVTs2bMHQNEUFSMjI8yZMweWlpbo2rUrEhISRMVMSUnBxIkT4eTkhG3btsHLywseHh7IyMiQy9SV8mKxgoiIiIiIiKgaWrZsmeS+v78/7O3tcfv2bXz22WeYMmWKqJhvN+0EgFmzZmHAgAHw8PBAVlaWTPlWBIsVRERERERERNXQ210dLly4gCVLlsDQ0BB+fn64f/++qJi2trYIDQ2V2vbtt99i8ODBiIuLkynfiqiypUuJiIiIiIiISLzc3FxER0dDEAQoKSlBRUVFsk/slI1du3aVut3Pzw++vr6iYorBYgURERERERFRNZSTk4M+ffpIRlg8fvwYjRo1QkZGBpSVxU2k0NDQKHNf165dcefOHVFxK4rFCiIiIiIiIqJq6OHDh6VuV1NTw969e0XFvH79epn7PmTPChYriIiIiIiIiD4iderUgaWlpajHOjg4wMLCQqofRrHU1FRZUys3FiuIiIiIiIiICABgbm6OU6dOwdjYuMQ+U1PTD5YHVwMhIiIiIiIiIgBA7969y1xJxMvL64PlwZEVRERERERERAQACAgIKHPfhg0bPlgeHFlBRERERERERAqFxQoiIiIiIiIiUigsVhARERERERGRQmGxgoiIiIiIiIgUCosVRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIobBYQUREREREREQKhcUKIiIiIiIiIlIoLFYQERERERERkUJhsYKIiIiIiIiIFAqLFURERERERESkUFisICIiIiIiIiKFwmIFERERERERESkUFiuIiIiIiIiISKGwWEFERERERERECoXFCiIiIiIiIiJSKCxWE
BEREREREZFCYbGCiIiIiIiIiBQKixVEREREREREpFBYrCAiIiIiIiIihcJiBREREREREREpFBYriIiIiIiIiEihsFhBRERERERERAqFxQoiIiIiIiIiUigsVhARERERERGRQmGxgoiIiIiIiIgUCosVRERERERERKRQWKwgIiIiIiIiIoXCYgURERERERERKRQWK4iIiIiIiIhIoVRpsWL9+vVo2bIlNDU1oampCRcXFxw+fFiyPzc3FxMnToSuri7q1asHb29vPH36VCpGQkICvLy8UKdOHejr62P69OkoKCj40C+FiIiIqhFegxARESm2Ki1WNGrUCMuXL8fly5dx6dIluLu7o0+fPrh58yYAYOrUqfj777+xZ88eHD9+HElJSejXr5/k8YWFhfDy8kJ+fj7OnDmDrVu3YsuWLZg3b15VvSQiIiKqBngNQkREpNiUBEEQqjqJt+no6ODHH39E//790bBhQ/z+++/o378/ACA2Nha2trY4e/YsnJ2dcfjwYfTq1QtJSUkwMDAAAGzYsAEzZszAs2fPoK6uXq7nzMzMhJaWFjIyMqCpqVnhnNPS0vDmzRvo6enhxYsX+Pfff9G8eXM0bdq0wrGIiIgqm6znvY9VdbwGISIiqm7Ke+5TmJ4VhYWF2LVrF16+fAkXFxdcvnwZr1+/hqenp+SYZs2awczMDGfPngUAnD17Fvb29pKLBADo3r07MjMzJb+MlCYvLw+ZmZlSN7F2794NS0tLWFlZYffu3XBzc8OmTZvQpUsX/Pnnn6LjEhER0YdRXa9BiIiIPmaqVZ1AdHQ0XFxckJubi3r16mHfvn1o3rw5rl69CnV1dWhra0sdb2BggOTkZABAcnKy1EVC8f7ifWVZtmwZvv/+e7nk/8MPPyAmJgZZWVlo06YNzpw5A3t7e8TFxWHQoEFSQ0Yr4saNG7Czs5NLjkRERFRSdb8GISIi+phV+ciKpk2b4urVqzh//jy+/PJLjBgxArdu3arU5/T390dGRobk9ujRI9GxBEGAsbExmjZtChMTE9jb2wMArKys8Pr1a9FxW7ZsiVatWmHNmjVIS0sTHYeIiIhKV92vQYiIiD5mVV6sUFdXh7W1NZycnLBs2TK0atUKAQEBMDQ0RH5+PtLT06WOf/r0KQwNDQEAhoaGJTpzF/9dfExpNDQ0JN2/i29iFRYWSu5PnDhRap8sHcFbtGiBefPmITQ0FGZmZhg4cCCOHTsmOh4RERFJq+7XIERERB+zKi9WvOvNmzfIy8uDk5MT1NTUEB4eLtl3+/ZtJCQkwMXFBQDg4uKC6OhopKSkSI4JCwuDpqYmmjdv/kHy7d27t2S+6ddffy3ZHhMTAwsLC9Fx1dTU4O3tjUOHDiEmJgZ2dnYYP348LCwssHDhQlnTJiIiondUt2sQIiKij1mVrgbi7++PHj16wMzMDFlZWfj999/xww8/4MiRI+jWrRu+/PJLHDp0CFu2bIGmpqakGHDmzBkARaMaHBwcYGxsjBUrViA5ORnDhg3D2LFjsXTp0nLnoYiduFu3bo0rV66U2B4eHo6goCDs3LmzCrIiIqKPgSKe9z40XoMQERFVjfKe+6q0wWZKSgqGDx+OJ0+eQEtLCy1btpRcJADA6tWroaysDG9vb+Tl5aF79+5Yt26d5PEqKio4ePAgvvzyS7i4uKBu3boYMWLEBx15UFmNMMta8szDwwMeHh5yfz4iIqKa5GO4BiEiIvqYVenICkUhy68aysrKsLe3x5gxYzB06FDo6OhUUpZERETywV/zFQc/CyIiqmnKe+5TuJ4V1c2HaoRZUFCAqKgoZGRkyD02ERERERERkSJhsUJGldUIMyIiArq6utDT08Px48fRoUMHDB48GFZWVjh+/LgcXwERERERERGRYmGxQo5MTU0xZ84cxMXFITAwELdv3xYdy9/fH+Hh4QgJCYG3tzeWL1+OmJgYHDp0CHPmzJFj1kRERERERESKpUobbH4MKqsRZn5+PhwcHAAA2tracHd3BwC0a9cO2dnZouMSERERERERKTqOrJDRuXPnKiXumzdvJPd9fHyk9hUWFlbKcxIREREREREpAhYrKlFqaqroxzo5OSEzMxMAsGzZMsn2uLg4dgsnIiIiIiKijxqLFZXI0dFR9GODgoJKLUqYm5sjLCxMlrSIiIiIiIiIFBp7VsjowIEDZe7Lzc2V63Nt3LgR48ePh6oqPzYiIiIiIiL6ePFbr4w+//xzuLm5QRCEEvuysrJExy2tCDJ//nwYGRkBAHr37i069ps3b6CsLD2o5sWLF2jQoIHomERERERERETywmKFjGxsbBAUFAQLC4sS+0xNTUXH7du3L1xcXKRWG8nIyMDq1auhpKQkqlhx6dIl+Pj4ICkpCT179sSmTZvQsGFDAEWrl1y5ckV0vkRERERERETywp4VMhoxYgSeP39e6r4JEyaIjhsYGAgAWLVqFSIjIxEZGQlDQ0NERkYiIiJCVMypU6di7dq1SEpKgp2dHVxdXZGYmAgApY4MEaugoABRUVHIyMiQW0wiIiIiIiKqOViskJG/vz/atGlT6r7Zs2eLjjtq1Cj8/vvv+O6777Bw4UIUFhZCSUlJdDwAyM7OhpeXF3R1dbFo0SLMnj0b7u7uePTokUyxIyIioKurCz09PRw/fhwdOnTA4MGDYWVlhePHj8uUMxEREREREdU8LFZUgo0bN8oljrm5OY4ePYq6deuic+fOyMvLkyneq1ev8ObNG8nfQ4cOxcKFC+Hh4SHTMqv+/v4IDw9HSEgIvL29sXz5csTExODQoUOYM2eOTDkTERERERFRzcOeFTKqzEaYAKCkpIRp06ahe/fuOHXqlEyxOnbsiEOHDqFXr16Sbb6+vlBSUsLQoUNFx83Pz4eDgwMAQFtbG+7u7gCAdu3aITs7W6aciYiIiIiIqOZhsUJGldEIszR2dnYwMTGRKUZQUFCp2wcMGIABAwaIjvv2aA0fHx+pfYWFhaLjfkgbNmyQqccIERERERERyQ+ngcioMhphAsDVq1fh4OCA1q1b4+bNm/Dy8oKJiQnMzMxw/fp1eaUvIcs0ECcnJ2RmZgIAli1bJtkeFxcHTU1NmXMDiqawREVFybQc7PssXbq0UuJeu3atUuISERERERF9zJQEeS4DUU1lZmZCS0sLGRkZor5cx8fHY+zYsejcuTNmz54NGxsb3L9/X6ac3NzcMHXqVKSnp2P+/PlYvHgxhg0bhv3792PdunU4evSoTPHfZWZmhoSEBLnGLCgowOvXr1G7du0KP3bGjBn44YcfABR94e/Rowc0NTWRlpaGkJAQuLq6Vjhmv379St0uCAKOHj2Kly9fVjjmf6mM95WISFaynvdIfvhZEBFRTVPecx+ngchBcSPMVatWyaURJlD0Afbt2xcAMG/ePAwbNgxA0bSTBQsWiIpZWn+NYrm5uaJiliY6OhoXL15Ey5Yty1wp5b+EhYVJihVz587FunXr0LdvX5w7dw7Tpk3D6dOnKxzzyJEj+Pnnn6Wm7ABFxYqTJ0+KyhMA1qxZU+p2QRDYs4OI
iIiIiEgEFivkpLgR5qeffirTF99ibw946dq1a5n7KuLzzz+Hm5tbqY+XZXqFh4cH/vjjD+jr6yM4OBhTp05Fx44dsXDhQvj7+2P8+PGiYwNAQkKCpHDj7OyMV69eiYrj4OAAR0fHUgsoc+fOFZ3ftGnTMGTIkFKXf339+rXouERERERERDUVixVy1qJFC7Ro0QIA0KRJE9y5c0dUHAMDA2RmZkJTUxNbt26VbH/y5Alq1aolKqaNjQ2CgoJgYWFRYp+pqamomADw7Nkz6OvrAwBWr16NM2fOwNzcHGlpaejSpYuoYkVKSgrWrFkDQRBKFFLebuhZEQEBATA2Ni51X3h4uKiYAGBrawt/f380bdq0xL5jx46JjgsAoaGhMDIyQqtWrRAeHo5///0XdnZ28PX1lSkuERERERGRImOxQkbva3Ypy2iFI0eOlLq9Tp06CAkJERVzxIgReP78eanFCllWwsjLy0NhYSFUVFQgCALMzc0BADo6OqJHgXTr1g1RUVEAAFdXVzx58gRGRkZITEyUFEYq6n1TUpo0aSIqJgBMnToV+fn5pe5bvHix6LjTp0/HkSNHUFBQgOHDh2PHjh3o2bMnVq5ciWvXrsmlKeizZ88QHR0NW1tbyXK7FZWQkAB9fX3UqlULgiBg/fr1OHfuHFq1aoUpU6ZAVZX/mSEiIiIioophg03I1txKWVkZFhYWpX4pT0xMLPNLrCxkGbFRGRYsWIAbN25g+fLl2L9/P/Ly8jBkyBAcPnwYoaGh+Ouvv6o6xTJ169YNYWFhVZ1GqZo3b46rV6/i5cuXaNSoEeLj46Gnp4eXL1+iXbt2uHnzZoVjDh8+HCtXroS+vj4iIiLg6+sLS0tLPHz4EJs2bZJMt6mIli1b4uzZs6hbty7mzZuHM2fOoF+/fjh69ChMTEzwv//9r8Ixiahysamj4uBnQURENQ0bbH4g5ubmOHXqVKnTC2SZWlFZIzbKkpqaCl1dXVGPXbBgAQICAtC1a1c8ffoUBQUFWLFiBQYNGoTNmzfLlNebN2+grCy9wu6LFy/QoEGDCscqbTWQs2fPSrb/+eef4pJ8x6tXr3D79m1YW1ujfv36ouNoaGhAXV0d6urq0NbWhp6eHgCgbt26JZqElte1a9ckI1O+//57hIWFwcHBAQ8ePEC/fv1EFSsEQUDdunUBAP/88w9OnjyJOnXq4IsvvkDr1q1F5VkaeTRuJSIiIiKi6kH5vw+h9+ndu3eZy5R6eXmJjuvg4IC+ffuiT58+JW6pqami45bF0dFRpsdPmTIFjx49QmpqKlJTU5GRkYENGzZAR0dHVLxLly7B0tIStWvXxueff45nz55J9nl4eIiKeebMGejq6krex969e6N27dqSv8WaMWOG5P61a9dgbW2NQYMGwcrKCidOnBAdt0GDBli7di2WLFkCPT09/PDDD0hOTsaWLVskxYGKysnJkdx/9eoVHBwcAACWlpYoLCwUFVNJSQlPnz4FANSvX18y7UNFRQUFBQWiYgJFn3NKSgoAIDg4GJ9++ilCQ0PRv39/bNy4UXRcoPS+Jy9evJApJhERERERyQ+LFTIKCAhAp06dSt23YcMG0XGLR2w8ePCgxM3AwEBUzAMHDpR5k9fSpfXr10eDBg1k/jI5depUrF27FklJSbCzs4OrqysSExMBiF8N5fr160hNTUV0dDQGDhyIkSNHon79+hgxYgRGjBghOte3p5EUL7MaGxuLAwcOwN/fX3TcTZs2ITw8HFeuXMHff/+NFy9ewMbGBj///LPoqRXdu3fHlClTkJ2dDU9PT+zcuROCIODw4cOSkRsVNX/+fHTt2hW//fYbOnfuDG9vb2zZsgVDhgyRqWBXWuPW4OBgXLlyRfTrr4wiGBERERERyR+ngSio4hEbpU0vEfsFsLKWLj1w4ECJbfPnz5c0bOzdu3eFY2ZnZ0te56JFi9C0aVO4u7vj2LFjpS4RWh76+vr4888/ERQUBFdXV6xdu1ZUnPeR1zKrAGBtbY19+/ZJ/l6+fDmWL18uU34//fQTZsyYARMTE+jo6CA+Ph4jR46Eh4cHAgMDRcX09vZG48aNsWrVKty6dQsFBQXYu3cvBg8ejIEDB4rOtTIatxYXwZydnfHzzz/D1dUVx44dg4mJieiYpeGUFSIiIiIi2bDBJmpOc6tmzZohNDS0zKVLHz16JCqusrIyXFxcpPoonDt3Ds7OzlBSUkJERESFYzZt2hQxMTFS/Sp2796NuXPnIi8vD/Hx8aJyLfbgwQOMGTMG169fx/Pnz2WK1ahRI3z33XcQBAFr1qxBXFycZF+rVq1w7do1meK/be7cuVi0aJHMcV69eoW4uDgUFBTAzMxMdL+SylQZjVsdHR0lq8wAwI4dO7Bo0SIcO3YMffr0wZUrV0Tl6uHhgT/++AP6+voIDg7G1KlT0bFjR1y4cAH+/v6ilu9du3YtfH190bBhQ1E5Eb1PTTnvVQf8LIiIqKZhg00qobKWLg0MDMRvv/2GVatWSXpfWFpaIjIyUnTMjh074tChQ+jVq5dkm6+vL5SUlDB06FDRcYtZWloiIiICmZmZMseqjGVWAWDNmjUltq1fv17y5XXy5MmiYycnJ0t6n2RkZFRaseLgwYNSn2FFVEbj1levXkk1bR06dCjU1NTg4eGBvLw8UTGB0qesmJubIy0tDV26dBFVrJg+fTpmzJiB7t27Y+zYsejRo4foUUXlde3aNbRq1apSn4OIiIiIqDw4sgL8VUMe4uPjMXbsWHTu3BmzZ8+GjY1NmY1HFcmoUaNkXrGksqiqqsLLy0uqSWlISAj69+8PJSUlBAUFVThmTEwMRowYgUePHsHMzAxA0dQVU1NTbN68GS1atJBb/gBgZmaGhIQEmeNkZWWhoKBA1Cowbxs9ejT69etXooASHByMoUOHil5quGnTprh16xZUVFTg7OyMc+fOSfbZ29sjOjq6wjEdHR0RGhqKLVu2YPPmzcjOzsaIESMwevRoWFlZicrzv8jyecXGxkJHRwf6+vqIjY3F6dOnYWdnh/bt28s5S5IHnvcUBz8LIiKqacp77mOxArxQkBdBELBq1Srs3bsX8fHxkoaYYr18+RIaGhpQVVVFWloaoqKi0LRpUzRq1EhUPD8/vxLbgoKCMHr0aADAqlWrROdaGV/UIiIi4O/vj7lz50q+XFtaWuLBgweiY7Zv3x7fffcdvL29pbaHhIRgxYoVuHDhQoVjlva+AkX/HoKCgpCRkSEq13cVFBQgOjoajRs3hpaWllxiyktlTFlp3bq11LSUkydPIigoCCEhIWjTpo3okUuljdgBij6v77//HmlpaRWO+eOPP2LlypXQ0NDA0qVLMWvWLDg7O+P8+fPw8/PDlClTROVaGnlNharpeN5THPwsiIiopinvuY+rgdQgcXFx6Nq1Kxo3bgw/Pz+pFUBcXFxkjq+kpIRp06Zh06ZNmDt3rkyxtm3bBj09Pcl0DTs7O/j7+8PBwQG7d+8WFXPjxo149uwZtLS0JDclJSXJfbF+/PFHuLm5oU2bNtixYwc++eQTHDl
yBAMGDEBAQIDouO7u7ggLC0NwcDBGjRqFzMxMmacBpKenlyhUAED//v1FFxXWrVuHevXqSb2vWlpa0NbWlinfiIgI6OrqQk9PD8ePH0eHDh0wePBgWFlZ4fjx46Ljvi06OhpBQUG4dOmSTHEWLFiAzp07o2vXrpg1axbmzp0r6VciduTOu3Xkzp07Y/PmzUhMTMTgwYNF5zpt2jRcuXIFUVFRUrerV6/i9evXomJu2bJFUqQbP348jh8/juDgYFy+fBm//vqr6FzXrFlT4rZ+/XrJfTEOHjwo05K6RERERPSBCCRkZGQIAISMjIyqTqVSffLJJ8LatWuFS5cuCcOGDRM6dOggZGZmCoIgCA4ODnJ9rrS0NJkeb29vLzx8+FC4du2aoKWlJVy8eFEQBEG4e/eu0LJlS1Exb926JXTu3Fn45ZdfJNssLCxkylMQBKF58+ZCWlqakJCQINSpU0e4f/++IAiC8OzZM6FFixYyxxcEQQgJCREcHBwEQ0NDmeJ06NBB2LZtm1BYWCjZVlhYKGzZskVwcXERFdPJyUm4fv16qfsaNWokKqYgCEK7du2EqKgoITIyUtDV1RXCw8MFQRCE8+fPC506dRIV093dXXj69KkgCIKwe/duwdjYWPDx8RHMzc2FDRs2iM71bZmZmTL/+xcEQZg4caIcsinJ3t5eiI2NLXWf2M/L0dFRct/MzExqnyz/bVFRURF69+4tjBw5UnKrV6+eMHLkSGHUqFGiYiorKwsNGzYUpk6dKty4cUN0bu+Kj48XcnJyBEEQhDdv3gj/+9//hGHDhgkrV64UXr9+LbfnkZeact6rDvhZEBFRTVPecx9HVtQgKSkpmDhxIpycnLBt2zZ4eXnBw8MDGRkZMv0CfvXqVTg4OKB169a4efMmvLy8YGJiAjMzM1y/fl1UTBUVFZibm6Nly5bQ1taWLP9obW0ttUJIRdja2iIiIgIpKSn45JNPEB8fL5eGhRoaGmjQoAFMTU0lo0EAQE9PD2pqajLHB4qWBw0NDRW9vGixrVu3YsuWLdDR0YGtrS1sbW2ho6Mj2S7GwoULUbt27TKfT6z8/Hw4ODigS5cu0NbWhru7OwCgXbt2yM7OFhWztEaYwcHBuHLlCv73v/+JzrXY1atXER4ejtOnT8s0XQdApSytCxQt31pWb47FixeLiqmhoYF//vkHO3bsgJKSkmT0U2RkJFRUVETnevToUSQnJ8Pb2xubN2/G5s2boaenh82bN4vq2QIALVu2xOHDh5Gbm4tOnTrB2dkZv/76q+h/U8V69eqFwsJCAEVLN//5559wdnbGyZMn5ToNhoiIiKjG+EDFE4VWU37VaNq0aYltP/74o+Dk5CRYW1uLjuvq6irs27dP2Lx5s2BmZiZs27ZNEARB2Ldvn9CtWzdRMVu3bi3cuHFDOHnypKCnpyecOnVKEARBiImJEezt7UXnWuzcuXOCk5OToK+vL3MsZ2dn4eDBg8L27dsFc3NzYdeuXYIgCEJERITg5OQkc/y3yevX/5SUFOHy5cvC5cuXhZSUFLnElLe3R9DMnDlTap/YfwNNmjQRCgoKBEEQhPbt20vts7OzExVTEATh2rVrgp2dnaCpqSkoKysLdnZ2QoMGDYT+/ftXyn9X/v77b7nHlMWFCxcEBwcHoXXr1sLVq1eFgQMHChoaGlIjYsTKyMgQhg0bJowcOVLIyMgQLC0tZYr39iiQnJwcYdu2bUKXLl2E+vXrix6tIQjS/35at24tvHz5UhAEQXj9+rVc/pslbzXlvFcd8LMgIqKaprznPhYrhJpzodC3b1/h8OHDJbb/9NNPgpKSkui4bw/zNjU1ldrXqlUrUTH/+ecfQUdHR9DT0xOOHTsmdOnSRWjatKmgqakpKQbI6tWrV0J0dLTMcSrri9pff/1V4mZgYCC5r0iSkpKE1atXC1OmTBGmTZsmBAUFCbm5uTLFHDVqVKn/n7x3757QsWNHUTHnz58veHt7C3fv3hV+/PFHYfHixcKDBw+EdevWCb179xadq4uLi3Dy5ElBEIo+t6+//lrIy8sTZs+eLQwfPlx03LK8+/+zioqJiZFMh4mJiRF+++034dy5c/JITeL58+dS041kJa+pUG8XK9529+5dYdasWaLj2tvbC8nJyYIgCIKbm5uQl5cnCELRlBBbW1vRcV+9eiXEx8eX2C7rFJaact6rDvhZEBFRTVPecx9XA0HN6cSdl5cHoGjI9rsSExNhYmIiKq6DgwOuXr0KABgxYoTU0P/iJoOyKiwsxNWrV2FqaioZxi/WmzdvSkwlefHihczLYr4tNTUVDRo0ED1lBQCUlZXh4uICdXV1ybZz587B2dkZSkpKiIiIkEeqEk2aNMGdO3cq/Ljg4GB89913aNWqFc6cOQNPT0+kp6cjJiYGhw8fhq2trVzzLCgowOvXr8ucevJfAgICsHLlSjx9+hQFBQWoX78+Bg0ahKVLl0otE1sRb/9/AADatm2LixcvAhD/vlbWKisfYuWO6OhoXLx4Ea1atYKTk5PM8Yo9ffoUly9fRs+ePUXH6NevH/7880+55VRs7969mDt3Lvz8/BAfH4+rV6/C29sbR48ehYmJCX788ccKxzx69CgGDBgAQRBgZWWF4OBgWFtbAyi5WkxF1ZTzXnXAz4KIiGoargZCJWhoaJRaqACArl27io5rYGCAzMxMANI9Cp48eYJatWqJjvs2FRUVODk5yVSouHTpEiwtLVG7dm18/vnnePbsmWSfh4eHPNKU0NXVhbKyMlJTU0XHKO5PsWrVKkRGRiIyMhKGhoaIjIwUXai4fv16mbesrCxRMRctWoRLly7hr7/+wvnz55Gbm4vDhw9j48aNmDRpkqiYZdm4cSNUVVVFFyoAYMqUKXj06BFSU1ORmpqKjIwMbNiwQXShAgDU1NQQGxsLoKigVLduXck+sT0bKmuVlcpYucPDwwMpKSkAiopXn376KUJDQ+Ht7Y2NGzeKzvVdKSkpSE5Olmn1lsooVABFfWV27tyJ48eP49ChQ0hISMDevXvx2WefYcWKFaJizpkzBydOnEB6ejomTpwIT09P3LhxA0DJ1WKIiIiIPjaqVZ0AfTjva3Yp9osqABw5cqTU7XXq1EFISIiomHv27IGPjw8A4Pnz5xgxYgROnToFR0dHbNu2DWZmZhWOOXXqVKxduxbOzs74+eef4erqimPHjsHExKTSLvwdHR2RkJAg6rGjRo2Cu7s7xo4di86dO2P27NkyNwR1cHCAhYVFqa9XbGFFRUUFenp6AIDGjRsjPj4eANC9e3d88803onM9cOBAiW3z58+HkZERAKB3796iYwNA/fr1ARQVQMaPHy9TrEWLFqFTp07Q09NDamqq5N99cnIyOnfuLCqmnZ0dfHx8YG9vX2Lfb7/9JjrX4oawDRo0kFtD2NIal5qbmyMtLQ1dunQR/f56eHjgjz/+gL6+PoKDgzF16lR07NgRCxcuhL+/v+i4sbGx0Pl/7d15WFRl/z/w96AgggIqIpBslgoh4oYLPS6JiYYpUbk8bpSWGl5ZyqPW19
I2lzIfLUvzUcGlMrEHfXJBZdFyNwFTS1NR3EYJExAQRPj8/vDHxMgMwhlGZuD9uq65LjznzIf75p7x3POZcz5306ZwcnLSJG7atWuHbt26KYpXqmPHjli3bp1BMcoqKipC+/btAQDjxo2Dp6cnBg0ahC1btlRLcWAiIiIiU8ZkRR1ijA+qFbG3t0dAQICiS+DnzZunSVa8/fbb8PPzw6pVq/Dtt99iypQpiI2NrXLM3NxchISEALj/4bJt27bo27cv4uPjDZr46/pQXaqgoEBxXADw8PDArl27sGjRIvTs2VNzK48h8fbt2wdXV9dy+9zc3BTFdHJyQlRUFAYOHIj169ejVatWAO5/83vv3j3FbQ0NDS13G0x2djb+/e9/Q6VSKUpWGCsBMmDAAJw7dw7nz59H69atNZezOTs7Y8WKFYpiGmuVldKVO27duqVZuWPYsGEGrdxRWFiI4uJi1KtXDyICDw8PAEDTpk0NSgQaIwmi7zaYDz74wKDbYC5dugQnJydYW1tDRLBs2TIcOnQI/v7+mDJlCurXr/rptqCgAIWFhZor4oKCgrBmzRoMHjxY74ouRERERLUFkxV1iDE+qALGuWKj7AecI0eOIDk5GfXq1cPUqVMVf1DLz8/XqlcxatQoWFpaIigoyKAkwPPPP4/evXvr/FBmyBUrpVQqFaZNm4bg4GDs27fPoFiDBw9GWlqaztdAaSKnqr766iuEh4fjjTfeQJcuXTTj8+eff2LmzJmK27pq1SqsXLkSixYtQseOHQEAXl5eSEpKUhzTGAmQUg4ODmjSpImmjoC7u7smcaNERXUZSpdxVeLzzz/Ha6+9BgsLC2zZsgXz58/H2LFj0ahRI2zcuFFRzBEjRmDYsGGYP38+XnzxRXz88ccYOXIkduzYYdDfwBhJkNLbYHJzc+Ht7Y2TJ0/Cy8sLmZmZ6NOnj+JkxaBBg3Dw4EEA9xNgBw4cQFhYGHbt2oW0tDRFS+OGhYVhz549CA4O1mzr3bs31q1bh/HjxytqJxEREZG5YIFN1J3iVlOmTMFLL72Ef/zjH+X2TZw4EcuXL1cU18LCQu8VG1evXlX0DaCPjw82btwIEcGYMWO0ihc+WMywsl555RWEhYVh0KBBWts3btyIUaNGKf6m0tvbG3FxcfD09Cy3z83NDZcvX1YUV5fqLgRq6tLT07Vug2ndujXS0tIUx4uKisLKlSuxdOlSrQTIhQsXDGrnb7/9hvDwcFy5ckWT+Lt06RLc3NwQFRUFX19fRXHVajW+//57XLx4EfXr14evry/++c9/6q09o1R1FIQ1RuHSOXPm4OTJk5g/fz42b96MwsJCTRIkLi4OW7ZsqXLMsoUpPTw8NLctAfdv40hJSVHUVj8/P5w4cQIA0LlzZ/z888+wsbHBvXv30KlTpwqTujWhrpz3zAHHgoiI6ppKn/uMsBKJ2eGyYYbx9PSUq1ev6tzXsmVLRTE9PDzEy8tLPD09xdPTUy5fviwiIllZWXqXHqwpc+fOlaNHj+rc99FHHymOm5KSIv7+/tKxY0c5efKkPPvss9KwYUNxc3OT48ePK45bVkZGhiQkJMi1a9cMimPMpTBLSkpk4cKF0qNHD3F1dTU43sWLF6Vfv37y/vvvy71798TLy8vgmF27dpVNmzaV2x4TEyMBAQGKYn7//ffi4eEhgwcPFkdHRxk+fLgMGDBAPDw85LffflPc1lu3bil+bmXk5OTIX3/9VW3xFi9eLC1bthRLS0tRqVRiZ2cnEyZMkJs3byqK1717d9m6dausW7dOPDw8NEshJyYmSufOnRW301hLl5aVl5cnycnJkpOTY3AsnvdMB8eCiIjqmsqe+5isEE4UDPXGG2/Izz//rHPfhAkTqvV35eXlyYULFxQ99/z589KnTx/x8vKSt956S+7cuaPZ171792pqYfXp1auXxMbGSlRUlLi7u8vatWtFRCQ2NlaeeeYZRTFHjx6tSSokJCSIo6OjBAQESPPmzSU2NlZRzE8++UScnJzEzc1N1q1bJ25ubvLSSy+Ju7u7LF68WFFMXU6cOCHLli2rlljVnQBp06aNon0Vadeunfz5558icv+1GxoaKiIicXFx0rdvX0UxRUQsLS1l8ODBsmXLFikuLlYcpyxjJ0BEqi8JcuTIEenQoYN06tRJUlNTZfjw4dKgQQNp1qyZJCQkKI67adMm8fHxkf/85z8ya9YsGTRokERFRcmIESMkMjJSUczp06drfk5NTRUXFxdp27atNG/eXPbu3au4rSI875kSjgUREdU1ZpGsmDt3rnTp0kUaNWokzZs3lyFDhsjp06e1jrlz5468/vrr0rRpU7G1tZWwsDDNt1el0tPTNd86N2/eXCIjI6WoqKjS7eBEwby0bt1a0fP69+8vS5culV9++UVGjx4tgYGBmm8oO3TooLg9xkqClG2Tm5ub1j5/f39FMdu3b6/5uVevXpKSkiIiImlpaYr/Bk8++aT89ddfcunSJbGxsZG0tDQREfnzzz/F19dXUUx9qvMbe5HqS4AEBgbK2rVrtT78FxcXS3R0tPTo0UNRzAfHuOwVRd7e3opiitxPnixcuFB8fHzExcVFZsyYIWfOnFEcT8Q4CRB9li9fXu0xMzMzq6XdycnJMmrUKOnUqZO0b99eBg0aJN9++62UlJQoild2zJ977jlNQvHgwYMSGBhoUFt53uMchIiIqKZU9tyn/AblarB3715ERETg0KFD2L17N4qKitC/f3/k5eVpjnnrrbfw448/IiYmBnv37sW1a9cQFham2V9cXIyQkBDcvXsXBw4cwJo1axAdHY333nuvJrpE1eTXX3/V+1BatDIjIwMRERHo3Lkz1q5di5CQEAQFBSE7O9ug1UAmTZqEF198ETExMcjMzERQUJCmjYasBiJlaoA8/fTTevdVxZ07dzQ/5+fno0OHDgDu120oLi5WFLN0KUw3N7dqWwoTAFJTU9GhQwd06tQJp06dQkhICB577DG4u7tX2/3/7dq1w7BhwwyOU/r/TtOmTeHj4wMfHx80bdpUs12J0lVWrl+/joULF1bbKiu2traYNm0afvvtN2zatAmZmZno0qULevXqhbVr1yqK6eXlhV69emHmzJlo2bIlZs6cqWgVoAf973//K/eYPXu25mel8vLyNH/Dv/76C6mpqVCr1Qa3t3Tp0mPHjuH48eP48ccfMWLEiGpZZvTSpUsIDQ0FAHTv3h35+fkGx6zrOAchIiIycY8ic1JZGRkZAkBzeWtWVpZYWlpKTEyM5pjff/9dAMjBgwdFRGT79u1iYWGh9U3HsmXLxM7OTnPP8MPwWw3To1KptGpWlH1YWloqitm2bdty2z799FPp3LmzPPHEE4rb+uAVCR9//LEEBAQYXF+jf//+Ol+T165dk65duyqKOXnyZHnjjTfk9u3bMnPmTFm/fr2UlJTI9u3b5emnn1YU01g1AIxxG4yx64BkZGTIsWPH5NixY5KRkWFQrLNnz8pTTz0ljRo1kj59+kh6erqIiNy4cUNWrlypOK6u12Rubq6sXLlSnnrqKYNj7
t+/X8aNGyeNGzeWnj17ypo1axS3VaVSSWBgoPTp00fzsLa2lj59+ih+va5Zs0asra2lZcuWkpCQIC4uLhIQECDNmjXTvHaV2rFjh6SmpoqISHx8vMyaNcugmI899pgsWbJEFi9eLK1atdLaV/YqKSV43iuPcxAiIqJHwyxuA3nQ2bNnBYCcOHFCRO7fUw+g3P3Q7u7usmjRIhEReffdd8tdLp2WliYAJDk5WefvKSgokOzsbM3j8uXLnCiYGGMU7QwNDZUdO3aU2/7ZZ5+JSqVSFFPEeEkQfbKysuTSpUuKnltYWChvvvmm2NnZiaenp6hUKqlfv74EBwdrbt+oqsOHD+utARAfH68opohxboMxRgKkrPPnz0tSUpIkJSXJ+fPnDY5nDIbc8qSPMRIgIiKrV6+WwMBArf/LPT09FccTuV8I8+LFi3L8+HGxt7fXFMc9e/asQQmAyMhI8fPzEx8fH5k3b574+vrKv/71L+nSpYu8/fbbimKGh4drPUoL4V65ckX69eunuK0i/ICsC+cgREREj4bZJSuKi4slJCREa2L7zTffiJWVVbljAwICNIXHXn31Venfv7/W/ry8PAEg27dv1/m7Zs+eLQDKPThRMB3GKNpZUFAgBQUFOvdduXJFUUwR4yVBKqK0bkepvLw8+fXXXyU5OVkyMzOrqVV/q44aAGU/AIwZM0Zrn9IPlcZIgIiInDp1SgICAsTZ2Vm6du0qXbt2FWdnZwkICJCTJ08qjmuMVVaUrqJREWMkQEpV9+otZdvq4eGhd19V+fj4SGFhofz1119iY2OjKY6am5srTz75pOK4xsJkhTbOQYiIiB4ds6hZUVZERAROnjyJDRs2GP13vf3228jOztY8Ll++bPTfSVWzZMkS/OMf/9C5b/ny5YpiNmjQAA0aNNC577HHHlMUEwA2bNhQrqYEAEydOtWg15Yx6naUsrGxgZ+fHzp27IhmzZoBANq0aaMoVlpaGvr27YtWrVph6tSpKCgoQLNmzWBhYYEePXoobmOLFi2Qk5MD4H5NiFJqtRrW1taKYooR6oAAwMsvv4wZM2ZArVbj8OHDOHz4MNRqNaZPn46XX35ZUcxPP/0UvXv3RpcuXbB+/Xr0798fO3fuxNChQ7FkyRLFbW3atGm5bbdu3VIcDwASEhIMen5FPDw8sGvXLtja2qJnz54oLCw0KJ6FhQVOnTqFffv2IS8vD/v37wcAnD59WnHdFuD+/y9WVlZo0qQJHBwc4OjoCOB+jRArKyvFcQsLC7F582YsXrwYS5cuRVJSkuJYpB/nIERERKanfk03AAAmT56MrVu34qeffkLLli01252dnXH37l1kZWXBwcFBs/3GjRtwdnbWHHPkyBGteDdu3NDs06WiD61EVVXRa8mQJEiHDh3g6emp80P0zZs3FcWsqDCl0gTIpEmT8MILL6B79+5YsmQJgoKCEBcXh8aNGxtUYHTnzp06t9vY2GDTpk2KYpYmQOzs7KotAQIAWVlZeOGFF8ptf/HFF/F///d/imJGR0fj9OnTyM3Nhbe3N06ePAkvLy9kZmaiT58+mDJliqK4qampCA8Ph4WFBdatW4fp06cjKSkJjo6O2Lp1K9q3b1/lmLoSINVJpVJh2rRpCA4Oxr59+wyK9eGHH6JXr16wsLDAhg0bMGvWLKjVaqjVaqxYsUJx3CZNmmDp0qXIzs6Go6MjFixYgLFjxyIuLg62traKYiYlJSE8PBwODg44c+YMevbsia+++gqNGjVCbGysQf+/0N84ByEiIjJRj+IyD31KSkokIiJCXF1d5Y8//ii3v7S41aZNmzTbTp8+rbO4Veml0iIiX3/9tdjZ2em95P9BvByWTJEx6nYYo3CpsQqMVsTQ22AeZEgdEBHjLF1a9m/n7u6utc+Q2xWMXbfjQdU9VtW9fO29e/fkl19+0TqHKHH27FkJDQ2VsLAwSU9PlxkzZkijRo3E399fU3Szqjp27Kg5Nx45ckRGjx4tIiIrVqyQIUOGGNRenvc4ByEiIqopZlGzYtKkSWJvby979uwRtVqteeTn52uOmThxori7u0tiYqL88ssv0qNHD63J/71796Rdu3bSv39/SU1Nlbi4OGnevHmVCppxokCmyBh1O4yRADFWgdHjx4/rfTg7OyuOq48hH6rPnj0rffv2FTs7O/H29pa2bduKnZ2dPP3003LmzBlFMY21yoox6nYYa6yMvXqLqXtwPDp16qT5uU2bNgbF5nmPcxAiIqKaYhbJCugoMAVAoqKiNMfcuXNHXn/9dWnSpInY2NjI888/L2q1WivOxYsXZeDAgdKwYUNxdHSUadOmSVFRUaXbwYkC1RXGSIAYq8CoMa4CMXYCpDqXLjXWKivGKFxqjLESMa+rQEpXkKhOTz31lCQmJoqISExMjAwYMECzj8kKw3EOQkREVDMqe+5TiRhQVa6WyMnJgb29PbKzs2FnZ1fTzSEyK6UFD3Xdg3316lXF99V7eXlh//79cHV1LbfPzc1NUVE6CwsLvXVArl69irt37ypqqzHqQOhy8+ZNNGnSBBYWymsjBwcHIyYmptz/dWq1GqGhoTh8+HCVYxpjrACgY8eOSElJAQC4u7vj0qVLmn0dOnRAampqlWNWVLclODgYarW6yjGB+68tPz8/jBs3DqNGjaqWOh5Hjx5FWFgYMjMz4eLigi1btsDPzw/Xr1/HihUr8N577ymOzfOe6eBYEBFRXVPZc59JFNgkIvNlrAKjgwcPRlpams4PwCEhIYpienh4YN++fXo/VCs1ZcoUzJkzB1lZWXj22Wfx0UcfYdu2bdi8eTMiIyOxa9euKsdMS0vD+PHjcfHiRYSGhmLu3LmalVt69OiBgwcPKmqrMQqXGmOsAOOs3mKMwrUA4Ovri/feew+rVq3CO++8g0GDBmH8+PHo16+f4pgBAQG4fPkybt68qRl74H7hRkMSFURERETmwGSWLiUiKssYy9eWfqjWxZAP1Tk5OQgNDUV4eDhEBKNHjwYAhIaGIiMjQ1HM0lVWYmJikJmZiaCgIM2KLYassqKPvb09goKCFD3XGGMFGGf52tKE1YULF8o9WrRoobitlpaWeOGFF7B9+3b8/vvvaNeuHSZMmABPT0988MEHimJmZWUBgFaigoiIiKiu4JUVRFRnLFmyRO8+Qz5UG+MKgIyMDERERAAA1q5di7lz5yIoKAi7d++GSqVS3FZjLF9bkTZt2uCPP/5Q9FxzugqkLDc3N8yaNQuzZs1CQkICVq9erSiOk5MTBg4ciHHjxmHQoEEG3f5DREREZG6YrCAiMlDpFQB2dnbVdgXAnTt3tP79zjvvwMrKSusKCyWMcRvEo06A2NvbIyAgQFESRFfC6tatW2jSpIlBCSsrKyud24OCghRfseLl5YVevXph5syZmDhxIsaMGYNXXnkFbdq0UdxOIiIiInPBZAURkYEqugIgJiZGUUwfHx/ExcVhwIABmm2RkZGwsLBAZGSkopiAcep2GKsOhDGSIMePH8fYsWN1FkPdtm0b/Pz8FMU9dOiQoudVxNbWFtOmTcO0
adNw4MABrF69Gl26dEGHDh0wfvx4jBkzptp/JxEREZGpYLKCiMhI7O3tYW9vr+i5GzZs0Ll96tSpGDZsmOI2mVPhUmMkQd544w29xVCnTZumqBgqcL8g6rhx45Cenq4piFp6VY0hBVFLBQYGIjAwEEuWLMGGDRuwYsUKJiuIiIioVuMNsEREJqhBgwZ6V1oxZJUVcypcaoximMYohgrcL4j64osvVmtBVF1JGltbW4wbNw779u1T3FYiIiIic8BkBRERGcRYq4EYIwlijGKowN8FUTt37oy1a9ciJCQEQUFByM7OVlwQNSEhQXF7iIiIiMwdbwMhIiKTZIzVW4xRDBUwTkHUpk2b6t1nyCorREREROaAyQoiIqozjFEMFTBOQdRHvcoKERERkSlRiSHXvdYSOTk5sLe3R3Z2Nuzs7Gq6OUREZGYKCwsBQGedkatXryqqM2JhYaG3wOjVq1dx9+7dqjf0/+N5z3RwLIiIqK6p7LmPV1bg7/uUc3JyarglRERkzkqTFmU1btxY0fnFzc0NO3bsgIuLS7l9Tz75pEHnrNLn8vuKmsc5CBER1TWVnYcwWYG/L6c1ZIk9IiKi6ubt7a13n9Jlccu6fft2tcQh5TgHISKiuuph8xDeBgKgpKQE165dQ+PGjRVXbS+Vk5MDNzc3XL58uVZdzlkb+1Ub+wSwX+amNvarNvYJqF39EhHcvn0brq6usLDgwmA1iXOQh2O/zEdt7BPAfpmT2tgnoPb1q7LzEF5Zgfv3Bbds2bJaY9rZ2dWKF9KDamO/amOfAPbL3NTGftXGPgG1p1+8osI0cA5SeeyX+aiNfQLYL3NSG/sE1K5+VWYewq9TiIiIiIiIiMikMFlBRERERERERCaFyYpq1qBBA8yePVvn8nXmrDb2qzb2CWC/zE1t7Fdt7BNQe/tFtUdtfY2yX+ajNvYJYL/MSW3sE1B7+/UwLLBJRERERERERCaFV1YQERERERERkUlhsoKIiIiIiIiITAqTFURERERERERkUpisICIiIiIiIiKTwmSFAl9++SU8PT1hbW2Nbt264ciRIxUeHxMTA29vb1hbW8PPzw/bt29/RC2tnHnz5iEgIACNGzeGk5MTQkNDcebMmQqfEx0dDZVKpfWwtrZ+RC2unDlz5pRro7e3d4XPMfWx8vT0LNcnlUqFiIgInceb6jj99NNPeO655+Dq6gqVSoXNmzdr7RcRvPfee3BxcUHDhg3Rr18/nD179qFxq/rerG4V9auoqAgzZsyAn58fbG1t4erqijFjxuDatWsVxlTyOq5ODxur8PDwcu0bMGDAQ+Oa8lgB0Pk+U6lU+PTTT/XGrOmxorqBcxDTPbeVxTmI6Y4T5yDmMwcBauc8hHOQymOyooq+//57TJ06FbNnz0ZycjL8/f0RHByMjIwMnccfOHAAI0aMwLhx45CSkoLQ0FCEhobi5MmTj7jl+u3duxcRERE4dOgQdu/ejaKiIvTv3x95eXkVPs/Ozg5qtVrzSE9Pf0QtrjxfX1+tNu7bt0/vseYwVkePHtXqz+7duwEAL730kt7nmOI45eXlwd/fH19++aXO/Z988gk+//xzLF++HIcPH4atrS2Cg4NRUFCgN2ZV35vGUFG/8vPzkZycjHfffRfJycn473//izNnzmDw4MEPjVuV13F1e9hYAcCAAQO02vfdd99VGNPUxwqAVn/UajVWr14NlUqFF154ocK4NTlWVPtxDvI3Uzy3PYhzENMcJ85BzGcOAtTOeQjnIFUgVCVdu3aViIgIzb+Li4vF1dVV5s2bp/P4oUOHSkhIiNa2bt26yYQJE4zaTkNkZGQIANm7d6/eY6KiosTe3v7RNUqB2bNni7+/f6WPN8exmjJlijz++ONSUlKic785jBMAiY2N1fy7pKREnJ2d5dNPP9Vsy8rKkgYNGsh3332nN05V35vG9mC/dDly5IgAkPT0dL3HVPV1bEy6+jR27FgZMmRIleKY41gNGTJE+vbtW+ExpjRWVDtxDnKfOZzbOAcxj3HiHMR85iAitXMewjlIxXhlRRXcvXsXx44dQ79+/TTbLCws0K9fPxw8eFDncw4ePKh1PAAEBwfrPd4UZGdnAwCaNm1a4XG5ubnw8PCAm5sbhgwZglOnTj2K5lXJ2bNn4erqilatWmHkyJG4dOmS3mPNbazu3r2L9evX45VXXoFKpdJ7nDmMU1kXLlzA9evXtcbC3t4e3bp10zsWSt6bpiA7OxsqlQoODg4VHleV13FN2LNnD5ycnNC2bVtMmjQJN2/e1HusOY7VjRs3sG3bNowbN+6hx5r6WJH54hxEmzmc2zgHMY9xKotzkPLM4bxWm+chdX0OwmRFFWRmZqK4uBgtWrTQ2t6iRQtcv35d53OuX79epeNrWklJCd5880089dRTaNeund7j2rZti9WrV2PLli1Yv349SkpKEBgYiCtXrjzC1lasW7duiI6ORlxcHJYtW4YLFy6gZ8+euH37ts7jzW2sNm/ejKysLISHh+s9xhzG6UGlf++qjIWS92ZNKygowIwZMzBixAjY2dnpPa6qr+NHbcCAAVi7di0SEhKwYMEC7N27FwMHDkRxcbHO481xrNasWYPGjRsjLCyswuNMfazIvHEO8jdzOLdxDmIe4/QgzkG0mcN5rbbPQ+r6HKR+TTeATEtERAROnjz50HucevTogR49emj+HRgYCB8fH3z99df48MMPjd3MShk4cKDm5/bt26Nbt27w8PDAxo0bK5WdNHWrVq3CwIED4erqqvcYcxinuqioqAhDhw6FiGDZsmUVHmvqr+Phw4drfvbz80P79u3x+OOPY8+ePQgKCqrBllWf1atXY+TIkQ8tDGfqY0Vk6jgHMR+cg5iv2jQHAWr/PKSuz0F4ZUUVODo6ol69erhx44bW9hs3bsDZ2Vnnc5ydnat0fE2aPHkytm7diqSkJLRs2bJKz7W0tETHjh1x7tw5I7XOcA4ODmjTpo3eNprTWKWnpyM+Ph7jx4+v0vPMYZxK/95VGQsl782aUjpJSE9Px+7duyv8RkOXh72Oa1qrVq3g6Oiot33mNFYA8PPPP+PMmTNVfq8Bpj9WZF44B9HPHM5tnIOYxzhxDlIxcziv1aZ5COcgTFZUiZWVFTp37oyEhATNtpKSEiQkJGhljsvq0aOH1vEAsHv3br3H1wQRweTJkxEbG4vExER4eXlVOUZxcTFOnDgBFxcXI7SweuTm5uL8+fN622gOY1UqKioKTk5OCAkJqdLzzGGcvLy84OzsrDUWOTk5OHz4sN6xUPLerAmlk4SzZ88iPj4ezZo1q3KMh72Oa9qVK1dw8+ZNve0zl7EqtWrVKnTu3Bn+/v5Vfq6pjxWZF85B9DOHcxvnIOYxTpyDVMwczmu1aR7COQi4GkhVbdiwQRo0aCDR0dHy22+/yWuvvSYODg5y/fp1EREZPXq0zJw5U3P8/v37pX79+rJw4UL5/fffZfbs2WJpaSknTpyoqS6UM2nSJLG3t5c9e/aIWq3WPPLz8zXHPNiv999/X3bu3Cnnz5+XY8eOyfDhw8X
a2lpOnTpVE13Qadq0abJnzx65cOGC7N+/X/r16yeOjo6SkZEhIuY5ViL3Kxa7u7vLjBkzyu0zl3G6ffu2pKSkSEpKigCQRYsWSUpKiqYi9fz588XBwUG2bNkiv/76qwwZMkS8vLzkzp07mhh9+/aVL774QvPvh703a7pfd+/elcGDB0vLli0lNTVV671WWFiot18Pex3XZJ9u374tkZGRcvDgQblw4YLEx8dLp06dpHXr1lJQUKC3T6Y+VqWys7PFxsZGli1bpjOGqY0V1X6cg9xnque2sjgHMd1x4hzEfOYgD+uXuc5DOAepPCYrFPjiiy/E3d1drKyspGvXrnLo0CHNvt69e8vYsWO1jt+4caO0adNGrKysxNfXV7Zt2/aIW1wxADofUVFRmmMe7Nebb76p+Ru0aNFCnn32WUlOTn70ja/AsGHDxMXFRaysrOSxxx6TYcOGyblz5zT7zXGsRER27twpAOTMmTPl9pnLOCUlJel8zZW2vaSkRN59911p0aKFNGjQQIKCgsr118PDQ2bPnq21raL35qNQUb8uXLig972WlJSkt18Pex3XZJ/y8/Olf//+0rx5c7G0tBQPDw959dVXy53szW2sSn399dfSsGFDycrK0hnD1MaK6gbOQUz33FYW5yCmO06cg5jPHORh/TLXeQjnIJWnEhFRelUGEREREREREVF1Y80KIiIiIiIiIjIpTFYQERERERERkUlhsoKIiIiIiIiITAqTFURERERERERkUpisICIiIiIiIiKTwmQFEREREREREZkUJiuIiIiIiIiIyKQwWUFEj1x4eDhCQ0P17p8zZw46dOjwyNpDREREdQPnIETmg8kKIjI5kZGRSEhIqOlmEBERUR3DOQiR6ahf0w0gIvNx9+5dWFlZGf33NGrUCI0aNTL67yEiIiLzwDkIUd3DKyuISK8+ffpg8uTJePPNN+Ho6Ijg4GAsWrQIfn5+sLW1hZubG15//XXk5uZqnhMdHQ0HBwfs3LkTPj4+aNSoEQYMGAC1Wq339xw9ehTNmzfHggULAJS/BLP0ks2FCxfCxcUFzZo1Q0REBIqKijTHqNVqhISEoGHDhvDy8sK3334LT09PLF68uNr/LkRERGRcnIMQEZMVRFShNWvWwMrKCvv378fy5cthYWGBzz//HKdOncKaNWuQmJiI6dOnaz0nPz8fCxcuxLp16/DTTz/h0qVLiIyM1Bk/MTERzzzzDD7++GPMmDFDbzuSkpJw/vx5JCUlYc2aNYiOjkZ0dLRm/5gxY3Dt2jXs2bMHP/zwA1asWIGMjIxq+RsQERHRo8c5CFHdxttAiKhCrVu3xieffKL5d9u2bTU/e3p64qOPPsLEiRPx1VdfabYXFRVh+fLlePzxxwEAkydPxgcffFAudmxsLMaMGYOVK1di2LBhFbajSZMmWLp0KerVqwdvb2+EhIQgISEBr776Kk6fPo34+HgcPXoUXbp0AQCsXLkSrVu3NqjvREREVHM4ByGq25isIKIKde7cWevf8fHxmDdvHk6fPo2cnBzcu3cPBQUFyM/Ph42NDQDAxsZGM0kAABcXl3LfMBw+fBhbt27Fpk2bKqzKXcrX1xf16tXTinnixAkAwJkzZ1C/fn106tRJs/+JJ55AkyZNqtxfIiIiMg2cgxDVbbwNhIgqZGtrq/n54sWLGDRoENq3b48ffvgBx44dw5dffgngfuGrUpaWlloxVCoVRERr2+OPPw5vb2+sXr1a675PfXTFLCkpqXJ/iIiIyDxwDkJUtzFZQUSVduzYMZSUlOCzzz5D9+7d0aZNG1y7dk1RLEdHRyQmJuLcuXMYOnRopSYL+rRt2xb37t1DSkqKZtu5c+dw69YtxTGJiIjIdHAOQlT3MFlBRJX2xBNPoKioCF988QXS0tKwbt06LF++XHE8JycnJCYm4vTp0xgxYgTu3bunKI63tzf69euH1157DUeOHEFKSgpee+01NGzYECqVSnH7iIiIyDRwDkJU9zBZQUSV5u/vj0WLFmHBggVo164dvvnmG8ybN8+gmM7OzkhMTMSJEycwcuRIFBcXK4qzdu1atGjRAr169cLzzz+PV199FY0bN4a1tbVB7SMiIqKaxzkIUd2jkgdv4iIiqgWuXLkCNzc3xMfHIygoqKabQ0RERHUE5yBE1YPJCiKqFRITE5Gbmws/Pz+o1WpMnz4dV69exR9//FGuMBYRERFRdeEchMg4uHQpEdUKRUVFeOedd5CWlobGjRsjMDAQ33zzDScJREREZFScgxAZB6+sICIiIiIiIiKTwgKbRERERERERGRSmKwgIiIiIiIiIpPCZAURERERERERmRQmK4iIiIiIiIjIpDBZQUREREREREQmhckKIiIiIiIiIjIpTFYQERERERERkUlhsoKIiIiIiIiITAqTFURERERERERkUv4frJL0ZU2EhckAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import scanpy\n", + "\n", + "# ScanPy `rank_genes_groups` assumes that the X data is logged.\n", + "scanpy.pp.log1p(adata.X, copy=False)\n", + "\n", + "# do ranking\n", + "scanpy.tl.rank_genes_groups(adata, \"cell_type_ontology_term_id\", method=\"t-test\")\n", + "\n", + "# visualize ranking\n", + "scanpy.pl.rank_genes_groups(adata)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/api/python/notebooks/census_summary_cell_counts.ipynb b/api/python/notebooks/census_summary_cell_counts.ipynb new file mode 100644 index 000000000..f651a5347 --- /dev/null +++ b/api/python/notebooks/census_summary_cell_counts.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Census summary cell counts example\n", + "\n", + "*Goal:* demonstrate basic use of the `census_summary_cell_counts` dataframe.\n", + "\n", + "Each Cell Census contains a top-level dataframe summarizing counts of various cell labels. You can read this into a Pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
organismcategoryontology_term_idunique_cell_counttotal_cell_countlabel
0Homo sapiensallna2204498034115852na
1Homo sapiensassayEFO:0008722177719260396Drop-seq
2Homo sapiensassayEFO:0008780051304inDrop
3Homo sapiensassayEFO:0008913133511133511single-cell RNA sequencing
4Homo sapiensassayEFO:000891944721161998Seq-Well
.....................
1147Mus musculustissue_generalUBERON:0002113164881188361kidney
1148Mus musculustissue_generalUBERON:00023651557731154exocrine gland
1149Mus musculustissue_generalUBERON:000236737715130135prostate gland
1150Mus musculustissue_generalUBERON:00023681332226644endocrine gland
1151Mus musculustissue_generalUBERON:000237154737109474bone marrow
\n", + "

1152 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " organism category ontology_term_id unique_cell_count \\\n", + "0 Homo sapiens all na 22044980 \n", + "1 Homo sapiens assay EFO:0008722 177719 \n", + "2 Homo sapiens assay EFO:0008780 0 \n", + "3 Homo sapiens assay EFO:0008913 133511 \n", + "4 Homo sapiens assay EFO:0008919 44721 \n", + "... ... ... ... ... \n", + "1147 Mus musculus tissue_general UBERON:0002113 164881 \n", + "1148 Mus musculus tissue_general UBERON:0002365 15577 \n", + "1149 Mus musculus tissue_general UBERON:0002367 37715 \n", + "1150 Mus musculus tissue_general UBERON:0002368 13322 \n", + "1151 Mus musculus tissue_general UBERON:0002371 54737 \n", + "\n", + " total_cell_count label \n", + "0 34115852 na \n", + "1 260396 Drop-seq \n", + "2 51304 inDrop \n", + "3 133511 single-cell RNA sequencing \n", + "4 161998 Seq-Well \n", + "... ... ... \n", + "1147 188361 kidney \n", + "1148 31154 exocrine gland \n", + "1149 130135 prostate gland \n", + "1150 26644 endocrine gland \n", + "1151 109474 bone marrow \n", + "\n", + "[1152 rows x 6 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import cell_census\n", + "\n", + "census = cell_census.open_soma()\n", + "census_summary_cell_counts = census[\"census_info\"][\"summary_cell_counts\"].read_as_pandas_all()\n", + "\n", + "# Dropping the soma_joinid column as it isn't useful in this demo\n", + "census_summary_cell_counts = census_summary_cell_counts.drop(columns=[\"soma_joinid\"])\n", + "\n", + "census_summary_cell_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dataframe is precomputed from the experiments in the Cell Census, and is intended to simplify quick looks at the Census contents.\n", + "\n", + "You can do similar group statistics using Pandas `groupby` functions. \n", + "\n", + "The code below reproduces the above counts using full `obs` dataframe in the `Homo_sapiens` experiment.\n", + "\n", + "Keep in mind that the Cell Census is very large, and any queries will return significant amount of data. You can manage that by narrowing the query request using `column_names` and `value_filter` in your query." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cell_type_ontology_term_idcell_typesize
0CL:0000001primary cultured cell80
1CL:0000003native cell611233
2CL:0000006neuronal receptor cell2502
3CL:0000019sperm11
4CL:0000031neuroblast (sensu Vertebrata)2355
............
540CL:4023041L5 extratelencephalic projecting glutamatergic...2361
541CL:4023051vascular leptomeningeal cell3937
542CL:4023070caudal ganglionic eminence derived GABAergic c...8463
543CL:4028002alveolar capillary type 1 endothelial cell16048
544CL:4028003alveolar capillary type 2 endothelial cell7157
\n", + "

545 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " cell_type_ontology_term_id \\\n", + "0 CL:0000001 \n", + "1 CL:0000003 \n", + "2 CL:0000006 \n", + "3 CL:0000019 \n", + "4 CL:0000031 \n", + ".. ... \n", + "540 CL:4023041 \n", + "541 CL:4023051 \n", + "542 CL:4023070 \n", + "543 CL:4028002 \n", + "544 CL:4028003 \n", + "\n", + " cell_type size \n", + "0 primary cultured cell 80 \n", + "1 native cell 611233 \n", + "2 neuronal receptor cell 2502 \n", + "3 sperm 11 \n", + "4 neuroblast (sensu Vertebrata) 2355 \n", + ".. ... ... \n", + "540 L5 extratelencephalic projecting glutamatergic... 2361 \n", + "541 vascular leptomeningeal cell 3937 \n", + "542 caudal ganglionic eminence derived GABAergic c... 8463 \n", + "543 alveolar capillary type 1 endothelial cell 16048 \n", + "544 alveolar capillary type 2 endothelial cell 7157 \n", + "\n", + "[545 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "human = census[\"census_data\"][\"homo_sapiens\"]\n", + "obs_df = human.obs.read_as_pandas_all(column_names=[\"cell_type_ontology_term_id\", \"cell_type\"])\n", + "obs_df.groupby(by=[\"cell_type_ontology_term_id\", \"cell_type\"], as_index=False, observed=True).size()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3da8ec1c162cd849e59e6ea2824b2e353dce799884e910aae99411be5277f953" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/cell_census_builder/REAMDE.md b/cell_census_builder/REAMDE.md new file mode 100644 index 000000000..a32e8f987 --- /dev/null +++ b/cell_census_builder/REAMDE.md @@ -0,0 +1,61 @@ +# README + +This is a tool to build the SOMA instantiation of the Cell Census schema, as specified in this doc: + +https://docs.google.com/document/d/1GKndzCk9q_1SdYOq3BeCxWgp-o2NSQkEmSBaBPKnNI8/ + +CAVEATS (READ THIS): + +1. The code is written to the still-rapidly-evolving and **pre-release** Python SOMA API, _and will be subject to change_ as the SOMA API and `tiledbsoma` evolve and stabilize. +2. The schema implemented by this code is still evolving and subject to change. + +## Usage + +TL;DR: + +- given a set of H5AD files, which comply with cellxgene 3.0 schema, +- create several SOMAExperiment aggregations representing mouse & human slices of the entire collection, and +- embed experiments into a single SOMACollection, along with other metadata about the aggregation/census + +The build process: + +- Pass 1: stage all source H5AD files +- Pass 2: build the axis dataframes for each experiment. This is a single-threaded pass, building dense dataframes. +- Pass 3: build the X layers for each experiment. This is a concurrent pass, reading/writing X layers in parallel. +- Pass 4: optional, validate the above + +Modes of operation: +a) (default) creating the entire "cell census" using all files currently in the CELLxGENE repository. 
+b) creating a smaller "cell census" from a user-provided list of files (a "manifest") + +### Mode (a) - creating the full cell census from the entire CELLxGENE (public) corpus: + +- On a large-memory machine with _ample_ free (local) disk (eg, 3/4 TB or more) and swap (1 TB or more) +- To create an cell census at ``, execute: + > $ python -m cell_census_builder -mp --max-workers 12 + +If you run out of memory, reduce `--max-workers`. You can also try a higher number if you have lots of CPU & memory. + +### Mode (b) - creating a cell census from a user-provided list of H5AD files: + +- Create a manifest file, in CSV format, containing two columns: dataset_id, h5ad_uri. Example: + ```csv + 53d208b0-2cfd-4366-9866-c3c6114081bc, /files/53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad + 559ed814-a9c9-4b77-a0e6-7da7b907fe3a, /files/559ed814-a9c9-4b77-a0e6-7da7b907fe3a.h5ad + 5b93b8fc-7c9a-45bd-ad3f-dc883137de30, /files/5b93b8fc-7c9a-45bd-ad3f-dc883137de30.h5ad + ``` + You can specify a file system path or a URI in the second field +- To create an cell census at ``, execute: + > $ python -m cell_census_builder --manifest + +### Other info + +There are more options discoverable via the `--help` command line option. + +Note on required host resources: + +- all H5AD files not on the local disk will be downloaded/cached locally. There must be + suffiicent local file system space. Location of cache can be controlled with the + environment variable `FSSPEC_CACHE_DIR` +- each H5AD will be read into memory, in its entirety. Sufficient RAM must be present to + allow for this (and to do so for multiple H5ADs concurrently if you use the `--multi-process` option) diff --git a/cell_census_builder/__init__.py b/cell_census_builder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cell_census_builder/__main__.py b/cell_census_builder/__main__.py new file mode 100644 index 000000000..9c040d40b --- /dev/null +++ b/cell_census_builder/__main__.py @@ -0,0 +1,315 @@ +import argparse +import gc +import logging +import multiprocessing +import os.path +import sys +from datetime import datetime, timezone +from typing import List, Tuple + +import tiledbsoma as soma + +from .anndata import open_anndata +from .census_summary import create_census_summary +from .consolidate import consolidate +from .datasets import Dataset, assign_soma_joinids, create_dataset_manifest +from .experiment_builder import ExperimentBuilder, populate_X_layers +from .globals import CENSUS_SCHEMA_VERSION, CXG_SCHEMA_VERSION, RNA_SEQ, TileDB_Ctx +from .manifest import load_manifest +from .mp import process_initializer +from .source_assets import stage_source_assets +from .summary_cell_counts import create_census_summary_cell_counts +from .util import uricat +from .validate import validate + + +def make_experiment_builders(base_uri: str, args: argparse.Namespace) -> List[ExperimentBuilder]: + """ + Define all soma.Experiments to build in the census. + + Functionally, this defines per-experiment name, anndata filter, etc. + It also loads any required per-Experiment assets. 
+ """ + GENE_LENGTH_BASE_URI = ( + "https://raw.githubusercontent.com/chanzuckerberg/single-cell-curation/" + "100f935eac932e1f5f5dadac0627204da3790f6f/cellxgene_schema_cli/cellxgene_schema/ontology_files/" + ) + GENE_LENGTH_URIS = [ + GENE_LENGTH_BASE_URI + "genes_homo_sapiens.csv.gz", + GENE_LENGTH_BASE_URI + "genes_mus_musculus.csv.gz", + GENE_LENGTH_BASE_URI + "genes_sars_cov_2.csv.gz", + ] + experiment_builders = [ # The soma.Experiments we want to build + ExperimentBuilder( + base_uri=base_uri, + name="homo_sapiens", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:9606", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ExperimentBuilder( + base_uri=base_uri, + name="mus_musculus", + anndata_cell_filter_spec=dict(organism_ontology_term_id="NCBITaxon:10090", assay_ontology_term_ids=RNA_SEQ), + gene_feature_length_uris=GENE_LENGTH_URIS, + ), + ] + + return experiment_builders + + +def main() -> int: + parser = create_args_parser() + args = parser.parse_args() + assert args.subcommand in ["build", "validate"] + + process_initializer(args.verbose) + + # normalize our base URI - must include trailing slash + args.uri = args.uri if args.uri.endswith("/") else args.uri + "/" + soma_path = uricat(args.uri, args.build_tag, "soma") + assets_path = uricat(args.uri, args.build_tag, "h5ads") + + # create the experiment builders + experiment_builders = make_experiment_builders(uricat(soma_path, "census_data"), args) + + cc = 0 + if args.subcommand == "build": + cc = build(args, soma_path, assets_path, experiment_builders) + + # sanity check for build completion + assert cc != 0 or all(e.is_finished() for e in experiment_builders) + + if cc == 0 and (args.subcommand == "validate" or args.validate): + cc = validate(args, experiment_builders) + + return cc + + +def build( + args: argparse.Namespace, soma_path: str, assets_path: str, experiment_builders: List[ExperimentBuilder] +) -> int: + """ + Approximately, build steps are: + 1. Download manifest and copy/stage all source assets + 2. Read all H5AD and create axis dataframe (serial) + * write obs/var dataframes + * accumulate overall shape of X + 3. Read all H5AD assets again, write X layer (parallel) + 4. 
Optional: validate + + Returns + ------- + int + Process completion code, 0 on success, non-zero indicating error, + suitable for providing to sys.exit() + """ + + # Don't clobber an existing census build + if os.path.exists(soma_path) or os.path.exists(assets_path): + logging.error("Census build path already exists - aborting build") + return 1 + + # Create top-level build directories + os.makedirs(soma_path, exist_ok=False) + os.makedirs(assets_path, exist_ok=False) + + # Step 1 - get all source assets + datasets = build_step1_get_source_assets(args, assets_path) + + # Step 2 - build axis dataframes + top_level_collection, filtered_datasets = build_step2_create_axis( + soma_path, assets_path, datasets, experiment_builders, args + ) + assign_soma_joinids(filtered_datasets) + logging.info(f"({len(filtered_datasets)} of {len(datasets)}) suitable for processing.") + gc.collect() + + # Step 3- create X layers + build_step3_create_X_layers(assets_path, filtered_datasets, experiment_builders, args) + gc.collect() + + # Write out dataset manifest and summary information + create_dataset_manifest(top_level_collection["census_info"], filtered_datasets) + create_census_summary_cell_counts( + top_level_collection["census_info"], [e.census_summary_cell_counts for e in experiment_builders] + ) + create_census_summary(top_level_collection["census_info"], experiment_builders, args.build_tag) + + if args.consolidate: + consolidate(top_level_collection.uri) + + return 0 + + +def create_top_level_collections(soma_path: str) -> soma.Collection: + """ + Create the top-level SOMA collections for the Census. + + Returns the top-most collection. + """ + top_level_collection = soma.Collection(soma_path, ctx=TileDB_Ctx()) + if top_level_collection.exists(): + logging.error("Census already exists - aborting") + raise Exception("Census already exists - aborting") + + top_level_collection.create() + # Set top-level metadata for the experiment + top_level_collection.metadata["created_on"] = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") + top_level_collection.metadata["cxg_schema_version"] = CXG_SCHEMA_VERSION + top_level_collection.metadata["census_schema_version"] = CENSUS_SCHEMA_VERSION + + # Create sub-collections for experiments, etc. 
+ for n in ["census_info", "census_data"]: + cltn = soma.Collection(uricat(top_level_collection.uri, n), ctx=TileDB_Ctx()).create() + top_level_collection.set(n, cltn, relative=True) + + return top_level_collection + + +def build_step1_get_source_assets(args: argparse.Namespace, assets_path: str) -> List[Dataset]: + logging.info("Build step 1 - get source assets - started") + + # Load manifest defining the datasets + datasets = load_manifest(args.manifest) + if len(datasets) == 0: + logging.error("No H5AD files in the manifest (or we can't find the files)") + raise AssertionError("No H5AD files in the manifest (or we can't find the files)") + + # Testing/debugging hook - hidden option + if args.test_first_n is not None and args.test_first_n > 0: + # Process the N smallest datasets + datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize)[0 : args.test_first_n] + + # Stage all files + stage_source_assets(datasets, args, assets_path) + + logging.info("Build step 1 - get source assets - finished") + return datasets + + +def build_step2_create_axis( + soma_path: str, + assets_path: str, + datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], + args: argparse.Namespace, +) -> Tuple[soma.Collection, List[Dataset]]: + """ + Create all objects, and populate the axis dataframes. + + Returns: the filtered datasets that will be included. This is simply + an optimization to allow subsequent X matrix writing to skip unused + datasets. + """ + logging.info("Build step 2 - axis creation - started") + + top_level_collection = create_top_level_collections(soma_path) + + # Create axis + for e in experiment_builders: + e.create(data_collection=top_level_collection["census_data"]) + assert soma.Experiment(e.se_uri).exists() + + # Write obs axis and accumulate var axis (and remember the datasets that pass our filter) + filtered_datasets = [] + N = len(datasets) * len(experiment_builders) + n = 1 + for (dataset, ad) in open_anndata(assets_path, datasets, backed="r"): + dataset_total_cell_count = 0 + for e in experiment_builders: + dataset_total_cell_count += e.accumulate_axes(dataset, ad, progress=(n, N)) + n += 1 + + dataset.dataset_total_cell_count = dataset_total_cell_count + if dataset_total_cell_count > 0: + filtered_datasets.append(dataset) + + # Commit / write var + for e in experiment_builders: + e.commit_axis() + logging.info(f"Experiment {e.name} will contain {e.n_obs} cells from {e.n_datasets} datasets") + + logging.info("Build step 2 - axis creation - finished") + return top_level_collection, filtered_datasets + + +def build_step3_create_X_layers( + assets_path: str, + filtered_datasets: List[Dataset], + experiment_builders: List[ExperimentBuilder], + args: argparse.Namespace, +) -> None: + """ + Create and populate all X layers + """ + logging.info("Build step 3 - X layer creation - started") + # base_path = args.uri + + # Create X layers + for e in experiment_builders: + e.create_X_layers(filtered_datasets) + e.create_joinid_metadata() + + # Process all X data + populate_X_layers(assets_path, filtered_datasets, experiment_builders, args) + + # tidy up and finish + for e in experiment_builders: + e.commit_X(consolidate=args.consolidate) + e.commit_presence_matrix(filtered_datasets) + + logging.info("Build step 3 - X layer creation - finished") + + +def create_args_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="cell_census_builder") + parser.add_argument("uri", type=str, help="Census top-level URI") + parser.add_argument("-v", "--verbose", 
action="count", default=0, help="Increase logging verbosity") + parser.add_argument( + "-mp", + "--multi-process", + action=argparse.BooleanOptionalAction, + default=False, + help="Use multiple processes", + ) + parser.add_argument("--max-workers", type=int, help="Concurrency") + parser.add_argument( + "--build-tag", + type=str, + default=datetime.now().astimezone().date().isoformat(), + help="Census build tag (default: current date is ISO8601 format)", + ) + + subparsers = parser.add_subparsers(required=True, dest="subcommand") + + # BUILD + build_parser = subparsers.add_parser("build", help="Build Cell Census") + build_parser.add_argument( + "--manifest", + type=argparse.FileType("r"), + help="Manifest file", + ) + build_parser.add_argument( + "--validate", action=argparse.BooleanOptionalAction, default=True, help="Validate immediately after build" + ) + build_parser.add_argument( + "--consolidate", + action=argparse.BooleanOptionalAction, + default=True, + help="Consolidate TileDB objects after build", + ) + # hidden option for testing. Will process only the first 'n' datasets + build_parser.add_argument("--test-first-n", type=int, help=argparse.SUPPRESS) + + # VALIDATE + subparsers.add_parser("validate", help="Validate an existing cell census build") + + return parser + + +if __name__ == "__main__": + # this is very important to do early, before any use of `concurrent.futures` + if multiprocessing.get_start_method(True) != "spawn": + multiprocessing.set_start_method("spawn", True) + + sys.exit(main()) diff --git a/cell_census_builder/anndata.py b/cell_census_builder/anndata.py new file mode 100644 index 000000000..835d486f7 --- /dev/null +++ b/cell_census_builder/anndata.py @@ -0,0 +1,164 @@ +import logging +from typing import Any, Iterator, List, Optional, Protocol, TypedDict, Union + +import anndata +import numpy as np +import pandas as pd + +from .datasets import Dataset +from .globals import CXG_SCHEMA_VERSION, CXG_SCHEMA_VERSION_IMPORT, FEATURE_REFERENCE_IGNORE, RNA_SEQ +from .util import uricat + +AnnDataFilterSpec = TypedDict( + "AnnDataFilterSpec", + { + "organism_ontology_term_id": Optional[str], + "assay_ontology_term_ids": Optional[List[str]], + }, +) + + +def open_anndata( + base_path: str, datasets: Union[List[Dataset], Dataset], *args: Any, **kwargs: Any +) -> Iterator[tuple[Dataset, anndata.AnnData]]: + """ + Generator to open anndata in a given mode, and filter out those H5ADs which do not match our base + criteria for inclusion in the census. + + Will localize non-local (eg s3) URIs to accomadate AnnData/H5PY requirement for a local file. + + Apply criteria to filter out H5ADs we don't want or can't process. Also apply a set of normalization + remainder of code expects, such as final/raw feature equivalence. + """ + if not isinstance(datasets, list): + datasets = [datasets] + + for h5ad in datasets: + path = uricat(base_path, h5ad.dataset_h5ad_path) + logging.debug(f"open_anndata: {path}") + ad = anndata.read_h5ad(path, *args, **kwargs) + + assert CXG_SCHEMA_VERSION == "3.0.0" + if h5ad.schema_version == "": + h5ad.schema_version = get_cellxgene_schema_version(ad) + if h5ad.schema_version not in CXG_SCHEMA_VERSION_IMPORT: + logging.error(f"H5AD has old schema version, skipping {h5ad.dataset_h5ad_path}") + continue + + # Multi-organism datasets - any dataset with 2+ feature_reference organisms is ignored, + # exclusive of values in FEATURE_REFERENCE_IGNORE. See also, cell filter for mismatched + # cell/feature organism values. 
+ feature_reference_organisms = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE + if len(feature_reference_organisms) > 1: + logging.info(f"H5AD ignored due to multi-organism feature_reference: {h5ad.dataset_id}") + continue + + # shape of raw and final must be same shape. Schema 2.0 disallows cell filtering, + # but DOES allow feature/gene filtering. The "census" specification requires that + # any filtered features be added back to the final layer. + if ad.raw is not None: + missing_from_var = ad.raw.var.index.difference(ad.var.index) + if len(missing_from_var) > 0: + raw_var = ad.raw.var.loc[missing_from_var].copy() + raw_var["feature_is_filtered"] = True + # TODO - these should be looked up in the ontology + raw_var["feature_name"] = "unknown" + raw_var["feature_reference"] = "unknown" + new_var = pd.concat([ad.var, raw_var]) + if ad.isbacked: + ad = ad.to_memory() + ad.X.resize(ad.n_obs, len(new_var)) + ad = anndata.AnnData(X=ad.X, obs=ad.obs, var=new_var, raw=ad.raw, dtype=np.float32) + + # sanity checks & expectations for any AnnData we can handle + if ad.raw is not None: + assert ad.X.shape == ad.raw.X.shape + assert len(ad.raw.var) == len(ad.var) + assert len(ad.raw.var.index.difference(ad.var.index)) == 0 + assert len(ad.var.index.difference(ad.raw.var.index)) == 0 + assert ad.X.shape == (len(ad.obs), len(ad.var)) + + # TODO: In principle, we could look up missing feature_name, but for now, just assert they exist + assert ((ad.var.feature_name != "") & (ad.var.feature_name != None)).all() # noqa: E711 + + yield (h5ad, ad) + + +class AnnDataFilterFunction(Protocol): + def __call__(self, ad: anndata.AnnData, retain_X: Optional[bool] = True) -> anndata.AnnData: + ... + + +def make_anndata_cell_filter(filter_spec: AnnDataFilterSpec) -> AnnDataFilterFunction: + """ + Return an anndata sliced/filtered for those cells/genes of interest. + + obs filter: + * not organoid or cell culture + * Caller-specified assays only + * Caller-specified taxa (obs.organism_ontology_term_id == '') + * Organism term ID value not equal to gene feature_reference value + + var filter: + * genes only (var.feature_biotype == 'gene') + """ + organism_ontology_term_id = filter_spec.get("organism_ontology_term_id", None) + assay_ontology_term_ids = filter_spec.get("assay_ontology_term_ids", None) + + def _filter(ad: anndata.AnnData, retain_X: Optional[bool] = True) -> anndata.AnnData: + obs_mask = ~( # noqa: E712 + ad.obs.tissue_ontology_term_id.str.endswith(" (organoid)") + | ad.obs.tissue_ontology_term_id.str.endswith(" (cell culture)") + ) + + if organism_ontology_term_id is not None: + obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == organism_ontology_term_id) + if assay_ontology_term_ids is not None: + obs_mask = obs_mask & ad.obs.assay_ontology_term_id.isin(RNA_SEQ) + + # multi-organism dataset cell filter - exclude any cells where organism != feature_reference + feature_references = set(ad.var.feature_reference.unique()) - FEATURE_REFERENCE_IGNORE + assert len(feature_references) == 1 # else there is a bug in open_anndata + feature_reference_organism_ontology_id = feature_references.pop() + obs_mask = obs_mask & (ad.obs.organism_ontology_term_id == feature_reference_organism_ontology_id) + + # This does NOT slice raw on the var axis. 
+ # See https://anndata.readthedocs.io/en/latest/generated/anndata.AnnData.raw.html + ad = ad[obs_mask, (ad.var.feature_biotype == "gene")] + + obs = ad.obs + var = ad.var + var.index.rename("feature_id", inplace=True) + X = ad.X if retain_X else None + raw = ad.raw if retain_X and ad.n_obs > 0 else None + + if raw: + # remove non-gene features + mask = ad.raw.var.feature_biotype == "gene" + raw = anndata.AnnData(X=ad.raw.X[:, mask], obs=ad.obs, var=ad.raw.var[mask], dtype=np.float32) + + # sanity checks + if raw is not None: + assert ad.var.index.difference(raw.var.index).empty + assert raw.var.index.difference(ad.var.index).empty + assert ad.X.shape == raw.X.shape + + # this dumps all other ancillary state, eg, obsm/varm/.... + ad = anndata.AnnData(X=X, obs=obs, var=var, raw=raw, dtype=np.float32) + return ad + + return _filter + + +def get_cellxgene_schema_version(ad: anndata.AnnData) -> str: + + # cellxgene >=2.0 + if "schema_version" in ad.uns: + # not sure why this is a nested array + return str(ad.uns["schema_version"]) + + # cellxgene 1.X + if "version" in ad.uns: + return str(ad.uns["version"]["corpora_schema_version"]) + + return "" diff --git a/cell_census_builder/census_summary.py b/cell_census_builder/census_summary.py new file mode 100644 index 000000000..ce74f7aa8 --- /dev/null +++ b/cell_census_builder/census_summary.py @@ -0,0 +1,40 @@ +import logging +from typing import Sequence + +import pandas as pd +import pyarrow as pa +import tiledbsoma as soma + +from .experiment_builder import ExperimentBuilder, get_summary_stats +from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, TileDB_Ctx +from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat + + +def create_census_summary( + info_collection: soma.Collection, experiment_builders: Sequence[ExperimentBuilder], build_tag: str +) -> None: + logging.info("Creating census summary") + + summary_stats = get_summary_stats(experiment_builders) + data = [ + ("cell_census_schema_version", CENSUS_SCHEMA_VERSION), + ("cell_census_build_date", build_tag), + ("total_cell_count", str(summary_stats["total_cell_count"])), + ("unique_cell_count", str(summary_stats["unique_cell_count"])), + ("number_donors_homo_sapiens", str(summary_stats["number_donors"]["homo_sapiens"])), + ("number_donors_mus_musculus", str(summary_stats["number_donors"]["mus_musculus"])), + ] + + df = pd.DataFrame.from_records(data, columns=["label", "value"]) + df["soma_joinid"] = range(len(df)) + + # TODO: work-around for TileDB-SOMA#274. Remove when fixed. + df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df) + + # write to a SOMA dataframe + summary_uri = uricat(info_collection.uri, CENSUS_SUMMARY_NAME) + summary = soma.DataFrame(summary_uri, ctx=TileDB_Ctx()) + summary.create(pa.Schema.from_pandas(df, preserve_index=False), index_column_names=["soma_joinid"]) + for batch in pa.Table.from_pandas(df, preserve_index=False).to_batches(): + summary.write(batch) + info_collection.set(CENSUS_SUMMARY_NAME, summary, relative=True) diff --git a/cell_census_builder/consolidate.py b/cell_census_builder/consolidate.py new file mode 100644 index 000000000..099f08dfc --- /dev/null +++ b/cell_census_builder/consolidate.py @@ -0,0 +1,37 @@ +import logging + +import tiledbsoma as soma + +if soma.get_storage_engine() == "tiledb": + import tiledb + + +def consolidate(uri: str) -> None: + """ + This is a non-portable, TileDB-specific consolidation routine. 
+ """ + if soma.get_storage_engine() != "tiledb": + return + + census = soma.Collection(uri) + if not census.exists(): + return + + consolidate_collection(census) + + +def consolidate_collection(collection: soma.Collection) -> None: + for soma_obj in collection.values(): + type = soma_obj.soma_type + if type in ["SOMADataFrame", "SOMASparseNdArray", "SOMADenseNdArray"]: + logging.info(f"Consolidating {type} {soma_obj.uri}") + consolidate_tiledb_object(soma_obj.uri) + elif type in ["SOMACollection", "SOMAExperiment", "SOMAMeasurement"]: + consolidate_collection(soma_obj) + else: + raise TypeError(f"Unknown SOMA type {type}.") + + +def consolidate_tiledb_object(uri: str) -> None: + tiledb.consolidate(uri, config=tiledb.Config({"sm.consolidation.buffer_size": 1 * 1024**3})) + tiledb.vacuum(uri) diff --git a/cell_census_builder/datasets.py b/cell_census_builder/datasets.py new file mode 100644 index 000000000..6e2217dcb --- /dev/null +++ b/cell_census_builder/datasets.py @@ -0,0 +1,83 @@ +import dataclasses +import logging +from typing import List, Type, TypeVar + +import pandas as pd +import pyarrow as pa +import tiledbsoma as soma + +from .globals import CENSUS_DATASETS_COLUMNS, CENSUS_DATASETS_NAME, TileDB_Ctx +from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat + +T = TypeVar("T", bound="Dataset") + + +@dataclasses.dataclass +class Dataset: + """ + Type used to handle source H5AD datasets read from manifest + """ + + # Required + dataset_id: str # CELLxGENE dataset_id + corpora_asset_h5ad_uri: str # the URI from which we originally read the H5AD asset + dataset_h5ad_path: str = "" # set after staging, required by end of process + + # Optional + dataset_title: str = "" # CELLxGENE dataset title + collection_id: str = "" # CELlxGENE collection id + collection_name: str = "" # CELlxGENE collection name + collection_doi: str = "" # CELLxGENE collection doi + asset_h5ad_filesize: int = -1 + + # Optional, inferred from data if not already known + schema_version: str = "" # empty string if version unknown + dataset_total_cell_count: int = 0 # number of cells in the census by dataset + + # Assigned late in the game, only to datasets we incorporate into the census + soma_joinid: int = -1 + + def __post_init__(self) -> None: + """ + Type contracts - downstream code assume these types, so enforce it. + """ + for f in dataclasses.fields(self): + assert isinstance( + getattr(self, f.name), f.type + ), f"{f.name} has incorrect type, expected {f.type}, got {type(getattr(self,f.name))}" + + @classmethod + def to_dataframe(cls: Type[T], datasets: List[T]) -> pd.DataFrame: + if len(datasets) == 0: + return pd.DataFrame({field.name: pd.Series(dtype=field.type) for field in dataclasses.fields(cls)}) + + return pd.DataFrame(datasets) + + @classmethod + def from_dataframe(cls: Type[T], datasets: pd.DataFrame) -> List["Dataset"]: + return [Dataset(**r) for r in datasets.to_dict("records")] + + +def assign_soma_joinids(datasets: List[Dataset]) -> None: + for joinid, dataset in enumerate(datasets): + dataset.soma_joinid = joinid + + +def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dataset]) -> None: + """ + Write the Cell Census `census_datasets` dataframe + """ + logging.info("Creating dataset_manifest") + manifest_df = Dataset.to_dataframe(datasets) + manifest_df = manifest_df[CENSUS_DATASETS_COLUMNS + ["soma_joinid"]] + + # TODO: work-around for TileDB-SOMA#274. Remove when fixed. 
+ manifest_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(manifest_df) + + # write to a SOMA dataframe + manifest_uri = uricat(info_collection.uri, CENSUS_DATASETS_NAME) + manifest = soma.DataFrame(manifest_uri, ctx=TileDB_Ctx()) + manifest.create(pa.Schema.from_pandas(manifest_df, preserve_index=False), index_column_names=["soma_joinid"]) + for batch in pa.Table.from_pandas(manifest_df, preserve_index=False).to_batches(): + manifest.write(batch) + info_collection.set(CENSUS_DATASETS_NAME, manifest, relative=True) diff --git a/cell_census_builder/experiment_builder.py b/cell_census_builder/experiment_builder.py new file mode 100644 index 000000000..945838f08 --- /dev/null +++ b/cell_census_builder/experiment_builder.py @@ -0,0 +1,566 @@ +import argparse +import concurrent.futures +import gc +import io +import logging +from enum import IntEnum +from typing import List, Optional, Sequence, Tuple, TypedDict, Union, overload + +import anndata +import numpy as np +import numpy.typing as npt +import pandas as pd +import pyarrow as pa +import tiledbsoma as soma +from scipy import sparse + +from .anndata import AnnDataFilterSpec, make_anndata_cell_filter, open_anndata +from .datasets import Dataset +from .globals import ( + CENSUS_OBS_TERM_COLUMNS, + CENSUS_VAR_TERM_COLUMNS, + CXG_OBS_TERM_COLUMNS, + DONOR_ID_IGNORE, + X_LAYERS, + TileDB_Ctx, +) +from .mp import create_process_pool_executor +from .source_assets import cat_file +from .summary_cell_counts import accumulate_summary_counts, init_summary_counts_accumulator +from .tissue_mapper import TissueMapper # type: ignore +from .util import ( + anndata_ordered_bool_issue_853_workaround, + array_chunker, + is_positive_integral, + pandas_dataframe_strings_to_ascii_issue_247_workaround, + uricat, +) + +# Contents: +# dataset_id +# dataset_soma_joinid - used as the presence row index +# eb_name +# data - presence COO data +# cols - presence COO col +# +# TODO: convert this to a dataclass or namedtuple. +# +PresenceResult = tuple[str, int, str, npt.NDArray[np.bool_], npt.NDArray[np.int64]] +PresenceResults = tuple[PresenceResult, ...] + +# UBERON tissue term mapper +tissue_mapper: TissueMapper = TissueMapper() + + +class ExperimentBuilder: + """ + Class to help build a parameterized SOMA experiment, where key parameters are: + * experiment "name" (eg, 'human'), must be unique in all experiments. + * an AnnData filter used to cherry pick data for the experiment + * methods to progressively build the experiment + + The creation and driving of these objects is done by the main loop. + """ + + name: str + anndata_cell_filter_spec: AnnDataFilterSpec + gene_feature_length_uris: List[str] + gene_feature_length: pd.DataFrame + build_state: "ExperimentBuilder.BuildState" + + # builder state sanity check, used to catch usage errors. 
+ + class BuildState(IntEnum): + Initialized = 0 + Created = 1 + AxisWritten = 2 + X_Created = 3 + X_JoinIdMetadataCreated = 4 + X_Written = 5 + X_Presence_Written = 6 + + def next(self) -> "ExperimentBuilder.BuildState": + return ExperimentBuilder.BuildState(self.value + 1) + + def __init__( + self, base_uri: str, name: str, anndata_cell_filter_spec: AnnDataFilterSpec, gene_feature_length_uris: List[str] + ): + self.name = name + self.anndata_cell_filter_spec = anndata_cell_filter_spec + self.gene_feature_length_uris = gene_feature_length_uris + self.se_uri = uricat(base_uri, name) + + # accumulated state + self.n_obs: int = 0 + self.n_unique_obs: int = 0 + self.n_var: int = 0 + self.n_datasets: int = 0 + self.n_donors: int = 0 # Caution: defined as (unique dataset_id, donor_id) tuples, *excluding* some values + self.var_df: pd.DataFrame = pd.DataFrame(columns=["feature_id", "feature_name"]) + self.dataset_obs_joinid_start: dict[str, int] + self.census_summary_cell_counts = init_summary_counts_accumulator() + self.presence: dict[int, tuple[npt.NDArray[np.bool_], npt.NDArray[np.int64]]] = {} + self.build_state = ExperimentBuilder.BuildState.Initialized + + self.load_assets() + + def load_assets(self) -> None: + """ + Load any external assets required to create the experiment. + """ + self.gene_feature_length = pd.concat( + pd.read_csv( + io.BytesIO(cat_file(uri)), + names=["feature_id", "feature_name", "gene_version", "feature_length"], + ) + .set_index("feature_id") + .drop(columns=["feature_name", "gene_version"]) + for uri in self.gene_feature_length_uris + ) + logging.info(f"Loaded gene lengths external reference for {self.name}, {len(self.gene_feature_length)} genes.") + + def is_finished(self) -> bool: + return self.build_state == ExperimentBuilder.BuildState.X_Presence_Written + + def create(self, data_collection: soma.Collection) -> None: + assert self.build_state == ExperimentBuilder.BuildState.Initialized + + """Make experiment at `uri` with a single Measurement and add to top-level collection.""" + logging.info(f"{self.name}: create experiment at {self.se_uri}") + + se = soma.Experiment(self.se_uri, ctx=TileDB_Ctx()) + if se.exists(): + logging.error("Census already exists - aborting") + raise Exception("Census already exists") + se.create() + data_collection.set(self.name, se, relative=True) + + # create `ms` + se.set("ms", soma.Collection(uricat(se.uri, "ms")).create(), relative=True) + + # create `obs` + obs_schema = pa.schema(list(CENSUS_OBS_TERM_COLUMNS.items())) + se.set( + "obs", + soma.DataFrame(uricat(se.uri, "obs")).create(obs_schema, index_column_names=["soma_joinid"]), + relative=True, + ) + + # make measurement and add to ms collection + measurement = soma.Measurement(uricat(se.ms.uri, "RNA")).create() + se.ms.set("RNA", measurement, relative=True) + + # make the `var` in the measurement + var_schema = pa.schema(list(CENSUS_VAR_TERM_COLUMNS.items())) + measurement.set( + "var", + soma.DataFrame(uricat(measurement.uri, "var")).create(var_schema, index_column_names=["soma_joinid"]), + relative=True, + ) + + # make the `X` collection (but not the actual layers) + measurement.set("X", soma.Collection(uricat(measurement.uri, "X")).create(), relative=True) + + # make the varp, to later contain the presence matrix + measurement.set("varp", soma.Collection(uricat(measurement.uri, "varp")).create(), relative=True) + + self.build_state = self.build_state.next() + return + + def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData, progress: Tuple[int, int] = (0, 0)) -> 
int: + """ + Write obs, accumate var. + + Returns: number of cells that make it past the experiment filter. + """ + progmsg = f"({progress[0]} of {progress[1]})" + logging.info(f"{self.name}: accumulate axis for dataset '{dataset.dataset_id}' {progmsg}") + assert self.build_state == ExperimentBuilder.BuildState.Created + + anndata_cell_filter = make_anndata_cell_filter(self.anndata_cell_filter_spec) + ad = anndata_cell_filter(ad, retain_X=False) + if ad.n_obs == 0: + logging.info(f"{self.name} - H5AD has no data after filtering, skipping {dataset.dataset_h5ad_path}") + return 0 + + # Narrow columns just to minimize memory footprint. Summary cell counting + # requires 'organism', do be careful not to delete that. + obs_df = ad.obs[list(CXG_OBS_TERM_COLUMNS) + ["organism"]].reset_index(drop=True).copy() + + # TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274 + obs_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(obs_df) + + obs_df["soma_joinid"] = range(self.n_obs, self.n_obs + len(obs_df)) + obs_df["dataset_id"] = dataset.dataset_id + + # high-level tissue mapping + add_tissue_mapping(obs_df, dataset.dataset_id) + + # Accumulate aggregation counts + self._accumulate_summary_cell_counts(obs_df) + + # drop columns we don't want to write + obs_df = obs_df[list(CENSUS_OBS_TERM_COLUMNS)] + obs_df = anndata_ordered_bool_issue_853_workaround(obs_df) + + se = soma.Experiment(self.se_uri, ctx=TileDB_Ctx()) + assert se.exists() + + pa_table = pa.Table.from_pandas( + obs_df, + preserve_index=False, + columns=list(CENSUS_OBS_TERM_COLUMNS), + ) + for pa_batch in pa_table.to_batches(): + se.obs.write(pa_batch) + + # Accmulate the union of all var ids/names (for raw and processed), to be later persisted. + # NOTE: assumes raw.var is None, OR has same index as var. Currently enforced in open_anndata(), + # but may need to evolve this logic if that assumption is not scalable. 
+ tv = ad.var.rename_axis("feature_id").reset_index()[["feature_id", "feature_name"]] + self.var_df = pd.concat([self.var_df, tv]).drop_duplicates() + + self.n_obs += len(obs_df) + self.n_unique_obs += (obs_df.is_primary_data == True).sum() # noqa: E712 + + donors = obs_df.donor_id.unique() + self.n_donors += len(donors) - np.isin(donors, DONOR_ID_IGNORE).sum() + + self.n_datasets += 1 + return len(obs_df) + + def commit_axis(self) -> None: + logging.info(f"{self.name}: commit axes") + se = soma.Experiment(self.se_uri) + assert se.exists() + assert self.build_state == ExperimentBuilder.BuildState.Created + + # if is possible there is nothing to write + if len(self.var_df) > 0: + # persist var + self.var_df["soma_joinid"] = range(len(self.var_df)) + self.var_df = self.var_df.join(self.gene_feature_length["feature_length"], on="feature_id") + self.var_df.feature_length.fillna(0, inplace=True) + + # TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274 + self.var_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(self.var_df) + + self.var_df = anndata_ordered_bool_issue_853_workaround(self.var_df) + + se.ms["RNA"].var.write( + pa.RecordBatch.from_pandas( + self.var_df, + preserve_index=False, + columns=list(CENSUS_VAR_TERM_COLUMNS), + ) + ) + + self.n_var = len(self.var_df) + self.build_state = self.build_state.next() + return + + def create_X_layers(self, datasets: List[Dataset]) -> None: + """ + Create layers in ms['RNA']/X + """ + logging.info(f"{self.name}: create X layers") + se = soma.Experiment(self.se_uri, ctx=TileDB_Ctx()) + assert se.exists() + assert se.ms["RNA"].exists() + assert self.n_obs >= 0 and self.n_var >= 0 + assert self.build_state == ExperimentBuilder.BuildState.AxisWritten + assert self.n_obs == 0 or self.n_datasets > 0 + + # SOMA does not currently support empty arrays, so special case this corner-case. + if self.n_obs > 0: + assert self.n_var > 0 + measurement = se.ms["RNA"] + for layer_name in X_LAYERS: + snda = soma.SparseNdArray(uricat(measurement.X.uri, layer_name), ctx=TileDB_Ctx()) + snda.create(pa.float32(), (self.n_obs, self.n_var)) + measurement.X.set(layer_name, snda, relative=True) + + presence_matrix = soma.SparseNdArray( + uricat(measurement.varp.uri, "dataset_presence_matrix"), ctx=TileDB_Ctx() + ) + max_dataset_joinid = max(d.soma_joinid for d in datasets) + presence_matrix.create(pa.bool_(), shape=(max_dataset_joinid + 1, self.n_var)) + measurement.varp.set("dataset_presence_matrix", presence_matrix, relative=True) + + self.build_state = self.build_state.next() + return + + def create_joinid_metadata(self) -> None: + logging.info(f"{self.name}: make joinid metadata") + assert self.build_state >= ExperimentBuilder.BuildState.AxisWritten + se = soma.Experiment(self.se_uri, ctx=TileDB_Ctx()) + assert se.exists() + + # Map of dataset_id -> starting soma_joinid for obs axis. This code _assumes_ + # that soma_joinid is contiguous (ie, no deletions in obs), which is + # known true for our use case (aggregating h5ads). 
+ self.dataset_obs_joinid_start = ( + se.obs.read_as_pandas_all(column_names=["dataset_id", "soma_joinid"]) + .groupby("dataset_id") + .min() + .soma_joinid.to_dict() + ) + + self.build_state = self.build_state.next() + + def commit_X(self, *, consolidate: bool = False) -> None: + logging.info(f"{self.name}: commit X") + assert self.build_state == ExperimentBuilder.BuildState.X_JoinIdMetadataCreated + self.build_state = self.build_state.next() + + def _accumulate_summary_cell_counts(self, obs_df: pd.DataFrame) -> None: + """ + Add summary counts to the census_summary_cell_counts dataframe + """ + assert "dataset_id" in obs_df + assert len(obs_df) > 0 + self.census_summary_cell_counts = accumulate_summary_counts(self.census_summary_cell_counts, obs_df) + + def commit_presence_matrix(self, datasets: List[Dataset]) -> None: + """ + Save presence matrix per Experiment + """ + assert self.build_state == ExperimentBuilder.BuildState.X_Written + + if len(self.presence) > 0: + max_dataset_joinid = max(d.soma_joinid for d in datasets) + + # A few sanity checks + assert len(self.presence) == self.n_datasets + assert max_dataset_joinid >= max(self.presence.keys()) # key is dataset joinid + + # LIL is fast way to create spmatrix + pm = sparse.lil_array((max_dataset_joinid + 1, self.n_var), dtype=bool) + for dataset_joinid, presence in self.presence.items(): + data, cols = presence + pm[dataset_joinid, cols] = data + + pm = pm.tocoo() + pm.eliminate_zeros() + assert pm.count_nonzero() == pm.nnz + assert pm.dtype == bool + se = soma.Experiment(self.se_uri, ctx=TileDB_Ctx()) + se.ms["RNA"].varp["dataset_presence_matrix"].write_sparse_tensor(pa.SparseCOOTensor.from_scipy(pm)) + + self.build_state = self.build_state.next() + + +def _accumulate_all_X_layers( + assets_path: str, + dataset: Dataset, + experiment_builders: List[ExperimentBuilder], + dataset_obs_joinid_starts: List[Union[None, int]], + ms_name: str, + progress: Tuple[int, int], +) -> PresenceResults: + """ + For this dataset, save all X layer information for each Experiment. This currently + includes: + X['raw'] - raw counts + + Also accumulates presence information per dataset. 
+
+    This is a helper function for ExperimentBuilder.accumulate_X
+    """
+    gc.collect()
+    logging.debug(f"Loading AnnData for dataset {dataset.dataset_id} ({progress[0]} of {progress[1]})")
+    unfiltered_ad = next(open_anndata(assets_path, [dataset]))[1]
+    assert unfiltered_ad.isbacked is False
+
+    presence = []
+    for eb, dataset_obs_joinid_start in zip(experiment_builders, dataset_obs_joinid_starts):
+        if dataset_obs_joinid_start is None:
+            # this dataset has no data for this experiment
+            continue
+
+        se = soma.Experiment(eb.se_uri, ctx=TileDB_Ctx())
+        assert se is not None
+        assert se.exists()
+
+        anndata_cell_filter = make_anndata_cell_filter(eb.anndata_cell_filter_spec)
+        ad = anndata_cell_filter(unfiltered_ad)
+        if ad.n_obs == 0:
+            continue
+
+        # follow CELLxGENE 3.0 schema conventions for raw/X aliasing when only raw counts exist
+        raw_X, raw_var = (ad.X, ad.var) if ad.raw is None else (ad.raw.X, ad.raw.var)
+
+        if not is_positive_integral(raw_X):
+            logging.error(f"{dataset.dataset_id} contains non-integer or negative valued data")
+
+        # save X['raw']
+        layer_name = "raw"
+        logging.info(
+            f"{eb.name}: saving X layer '{layer_name}' for dataset '{dataset.dataset_id}' "
+            f"({progress[0]} of {progress[1]})"
+        )
+        global_var_joinids = (
+            se.ms[ms_name].var.read_as_pandas_all(column_names=["feature_id", "soma_joinid"]).set_index("feature_id")
+        )
+        local_var_joinids = raw_var.join(global_var_joinids).soma_joinid.to_numpy()
+        assert (local_var_joinids >= 0).all(), f"Illegal join id, {dataset.dataset_id}"
+
+        for n, X in enumerate(array_chunker(raw_X), start=1):
+            logging.debug(f"{eb.name}/{layer_name}: X chunk {n} {dataset.dataset_id}")
+            # remap to match axes joinids
+            row = X.row.astype(np.int64) + dataset_obs_joinid_start
+            assert (row >= 0).all()
+            col = local_var_joinids[X.col]
+            assert (col >= 0).all()
+            X_remap = sparse.coo_array((X.data, (row, col)), shape=(eb.n_obs, eb.n_var))
+            se.ms[ms_name].X[layer_name].write_sparse_tensor(pa.SparseCOOTensor.from_scipy(X_remap))
+            gc.collect()
+
+        # Save presence information by dataset_id
+        assert dataset.soma_joinid >= 0  # i.e., it was assigned prior to this step
+        pres_data = raw_X.sum(axis=0) > 0
+        # sum() yields an np.matrix for scipy sparse matrices and a 1-D ndarray for
+        # dense arrays; normalize both cases to a 1-D boolean vector
+        pres_data = np.asarray(pres_data).ravel()
+        pres_cols = local_var_joinids[np.arange(ad.n_vars, dtype=np.int64)]
+
+        assert pres_data.dtype == bool
+        assert pres_cols.dtype == np.int64
+        assert pres_data.shape == (ad.n_vars,)
+        assert pres_data.shape == pres_cols.shape
+        assert ad.n_vars <= eb.n_var
+
+        presence.append(
+            (
+                dataset.dataset_id,
+                dataset.soma_joinid,
+                eb.name,
+                pres_data,
+                pres_cols,
+            )
+        )
+
+    gc.collect()
+    return tuple(presence)
+
+
+@overload
+def _accumulate_X(
+    assets_path: str, dataset: Dataset, experiment_builders: List["ExperimentBuilder"], progress: Tuple[int, int]
+) -> PresenceResults:
+    ...
+
+
+@overload
+def _accumulate_X(
+    assets_path: str,
+    dataset: Dataset,
+    experiment_builders: List["ExperimentBuilder"],
+    progress: Tuple[int, int],
+    executor: Optional[concurrent.futures.Executor],
+) -> concurrent.futures.Future[PresenceResults]:
+    ...
+
+
+def _accumulate_X(
+    assets_path: str,
+    dataset: Dataset,
+    experiment_builders: List["ExperimentBuilder"],
+    progress: Tuple[int, int],
+    executor: Optional[concurrent.futures.Executor] = None,
+) -> Union[concurrent.futures.Future[PresenceResults], PresenceResults]:
+    """
+    Save X layer data for a single AnnData, for all Experiments. Return a future if
+    executor is specified, otherwise immediately do the work.
+ """ + for eb in experiment_builders: + # sanity checks + assert eb.build_state == ExperimentBuilder.BuildState.X_JoinIdMetadataCreated + assert eb.dataset_obs_joinid_start is not None + + dataset_obs_joinid_starts = [ + eb.dataset_obs_joinid_start.get(dataset.dataset_id, None) for eb in experiment_builders + ] + if executor is not None: + return executor.submit( + _accumulate_all_X_layers, + assets_path, + dataset, + experiment_builders, + dataset_obs_joinid_starts, + "RNA", + progress, + ) + else: + return _accumulate_all_X_layers( + assets_path, dataset, experiment_builders, dataset_obs_joinid_starts, "RNA", progress + ) + + +def populate_X_layers( + assets_path: str, datasets: List[Dataset], experiment_builders: List[ExperimentBuilder], args: argparse.Namespace +) -> None: + """ + Do all X layer processing for all Experiments. + """ + + # populate X layers + presence: List[PresenceResult] = [] + if args.multi_process: + with create_process_pool_executor(args) as pe: + + futures = { + _accumulate_X( + assets_path, + dataset, + experiment_builders, + progress=(n, len(datasets)), + executor=pe, + ): dataset + for n, dataset in enumerate(datasets, start=1) + } + + for n, f in enumerate(concurrent.futures.as_completed(futures), start=1): + # propagate exceptions - not expecting any other return values + presence += f.result() + logging.info(f"pass 2, {futures[f].dataset_id} ({n} of {len(futures)}) complete.") + + else: + for n, dataset in enumerate(datasets, start=1): + presence += _accumulate_X(assets_path, dataset, experiment_builders, progress=(n, len(datasets))) + + eb_by_name = {e.name: e for e in experiment_builders} + for _, dataset_soma_joinid, eb_name, pres_data, pres_col in presence: + eb_by_name[eb_name].presence[dataset_soma_joinid] = (pres_data, pres_col) + + +class SummaryStats(TypedDict): + total_cell_count: int + unique_cell_count: int + number_donors: dict[str, int] + + +def get_summary_stats(experiment_builders: Sequence[ExperimentBuilder]) -> SummaryStats: + return { + "total_cell_count": sum(e.n_obs for e in experiment_builders), + "unique_cell_count": sum(e.n_unique_obs for e in experiment_builders), + "number_donors": {e.name: e.n_donors for e in experiment_builders}, + } + + +def add_tissue_mapping(obs_df: pd.DataFrame, dataset_id: str) -> None: + """Inplace addition of tissue_general-related columns""" + + tissue_ids = obs_df.tissue_ontology_term_id.unique() + + # Map specific ID -> general ID + tissue_general_id_map = {id: tissue_mapper.get_high_level_tissue(id) for id in tissue_ids} + if not all(tissue_general_id_map.values()): + logging.error(f"{dataset_id} contains tissue types which could not be generalized.") + obs_df["tissue_general_ontology_term_id"] = obs_df.tissue_ontology_term_id.map(tissue_general_id_map) + + # Assign general label + tissue_general_label_map = { + id: tissue_mapper.get_label_from_writable_id(id) for id in tissue_general_id_map.values() + } + obs_df["tissue_general"] = obs_df.tissue_general_ontology_term_id.map(tissue_general_label_map) diff --git a/cell_census_builder/globals.py b/cell_census_builder/globals.py new file mode 100644 index 000000000..89964d8e7 --- /dev/null +++ b/cell_census_builder/globals.py @@ -0,0 +1,126 @@ +import pyarrow as pa +import tiledb + +CENSUS_SCHEMA_VERSION = "0.0.1" + +CXG_SCHEMA_VERSION = "3.0.0" # version we write to the census +CXG_SCHEMA_VERSION_IMPORT = [CXG_SCHEMA_VERSION] # versions we can ingest + +# Columns expected in the census_datasets dataframe +CENSUS_DATASETS_COLUMNS = [ + "collection_id", + 
"collection_name", + "collection_doi", + "dataset_id", + "dataset_title", + "dataset_h5ad_path", + "dataset_total_cell_count", +] +CENSUS_DATASETS_NAME = "datasets" # object name + +CENSUS_SUMMARY_CELL_COUNTS_COLUMNS = { + "organism": pa.string(), + "category": pa.string(), + "label": pa.string(), + "ontology_term_id": pa.string(), + "total_cell_count": pa.int64(), + "unique_cell_count": pa.int64(), +} +CENSUS_SUMMARY_CELL_COUNTS_NAME = "summary_cell_counts" # object name + +CENSUS_SUMMARY_NAME = "summary" + +# CXG schema columns we preserve in our census, and the Arrow type to encode as. Schema: +# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/3.0.0/schema.md +# +# NOTE: a few additional columns are added (they are not defined in the CXG schema), +# eg., dataset_id, tissue_general, etc. +CXG_OBS_TERM_COLUMNS = { + "assay": pa.large_string(), + "assay_ontology_term_id": pa.large_string(), + "cell_type": pa.large_string(), + "cell_type_ontology_term_id": pa.large_string(), + "development_stage": pa.large_string(), + "development_stage_ontology_term_id": pa.large_string(), + "disease": pa.large_string(), + "disease_ontology_term_id": pa.large_string(), + "donor_id": pa.large_string(), + "is_primary_data": pa.bool_(), + "self_reported_ethnicity": pa.large_string(), + "self_reported_ethnicity_ontology_term_id": pa.large_string(), + "sex": pa.large_string(), + "sex_ontology_term_id": pa.large_string(), + "suspension_type": pa.large_string(), + "tissue": pa.large_string(), + "tissue_ontology_term_id": pa.large_string(), +} +CENSUS_OBS_TERM_COLUMNS = { + "soma_joinid": pa.int64(), + "dataset_id": pa.large_string(), + **CXG_OBS_TERM_COLUMNS, + "tissue_general": pa.large_string(), + "tissue_general_ontology_term_id": pa.large_string(), +} + +CENSUS_VAR_TERM_COLUMNS = { + "soma_joinid": pa.int64(), + "feature_id": pa.large_string(), + "feature_name": pa.large_string(), + "feature_length": pa.int64(), +} + +X_LAYERS = [ + "raw", +] + +# list of EFO terms that correspond to RNA seq modality/measurement +RNA_SEQ = [ + "EFO:0008720", # DroNc-seq + "EFO:0008722", # Drop-seq + "EFO:0008780", # inDrop + "EFO:0008913", # single-cell RNA sequencing + "EFO:0008919", # Seq-Well + "EFO:0008930", # Smart-seq + "EFO:0008931", # Smart-seq2 + "EFO:0008953", # STRT-seq + "EFO:0008995", # 10x technology + "EFO:0009899", # 10x 3' v2 + "EFO:0009900", # 10x 5' v2 + "EFO:0009901", # 10x 3' v1 + "EFO:0009922", # 10x 3' v3 + "EFO:0010010", # CEL-seq2 + "EFO:0010183", # single cell library construction + "EFO:0010550", # sci-RNA-seq + "EFO:0011025", # 10x 5' v1 + "EFO:0030002", # microwell-seq + "EFO:0030003", # 10x 3' transcription profiling + "EFO:0030004", # 10x 5' transcription profiling + "EFO:0030019", # Seq-Well S3 + "EFO:0700003", # BD Rhapsody Whole Transcriptome Analysis + "EFO:0700004", # BD Rhapsody Targeted mRNA +] + +DONOR_ID_IGNORE = ["pooled", "unknown"] + +# Feature_reference values which are ignored (not considered) for +# multi-organism filtering. 
+SARS_COV_2 = "NCBITaxon:2697049" +ERCC_SPIKE_INS = "NCBITaxon:32630" +FEATURE_REFERENCE_IGNORE = {SARS_COV_2, ERCC_SPIKE_INS} + + +""" +Singletons used throughout the package +""" + +# Global TileDB context +_TileDB_Ctx: tiledb.Ctx = None + + +def TileDB_Ctx() -> tiledb.Ctx: + return _TileDB_Ctx + + +def set_tiledb_ctx(ctx: tiledb.Ctx) -> None: + global _TileDB_Ctx + _TileDB_Ctx = ctx diff --git a/cell_census_builder/manifest.py b/cell_census_builder/manifest.py new file mode 100644 index 000000000..3fdef61d0 --- /dev/null +++ b/cell_census_builder/manifest.py @@ -0,0 +1,147 @@ +import concurrent.futures +import csv +import io +import logging +import os.path +from typing import List, Optional, Union + +from .datasets import Dataset +from .globals import CXG_SCHEMA_VERSION_IMPORT +from .util import fetch_json + +CXG_BASE_URI = "https://api.cellxgene.cziscience.com/" + + +def parse_manifest_file(manifest_fp: io.TextIOBase) -> list[Dataset]: + """ + return manifest as list of tuples, (dataset_id, URI/path), read from the text stream + """ + # skip comments and strip leading/trailing white space + skip_comments = csv.reader(row for row in manifest_fp if not row.startswith("#")) + stripped = [[r.strip() for r in row] for row in skip_comments] + return [Dataset(dataset_id=r[0], corpora_asset_h5ad_uri=r[1]) for r in stripped] + + +def dedup_datasets(datasets: List[Dataset]) -> List[Dataset]: + ds = {d.dataset_id: d for d in datasets} + if len(ds) != len(datasets): + logging.warning("Dataset manifest contained DUPLICATES, which will be ignored.") + return list(ds.values()) + return datasets + + +def load_manifest_from_fp(manifest_fp: io.TextIOBase) -> list[Dataset]: + logging.info("Loading manifest from file") + all_datasets = parse_manifest_file(manifest_fp) + datasets = [ + d + for d in all_datasets + if d.corpora_asset_h5ad_uri.endswith(".h5ad") and os.path.exists(d.corpora_asset_h5ad_uri) + ] + if len(datasets) != len(all_datasets): + logging.warning("Manifest contained records which are not H5AD files or which are not accessible - ignoring") + return datasets + + +def null_to_empty_str(val: Union[None, str]) -> str: + if val is None: + return "" + return val + + +def load_manifest_from_CxG() -> list[Dataset]: + logging.info("Loading manifest from CELLxGENE data portal...") + + # Load all collections and extract dataset_id + collections = fetch_json(f"{CXG_BASE_URI}curation/v1/collections") + assert isinstance(collections, list), "Unexpected REST API response, /curation/v1/collections" + datasets = { + dataset["id"]: { + "collection_id": collection["id"], + "collection_name": null_to_empty_str(collection["name"]), + "collection_doi": null_to_empty_str(collection["doi"]), + "dataset_title": dataset.get("title", ""), # title is optional in schema + "dataset_id": dataset["id"], + } + for collection in collections + for dataset in collection["datasets"] + if dataset["tombstone"] is False # ignore anything that has been deleted + } + logging.info(f"Found {len(datasets)} datasets, in {len(collections)} collections") + + # load per-dataset schema version + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as tp: + dataset_metadata = tp.map( + lambda d: fetch_json( + f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}" + ), + datasets.values(), + ) + for d in dataset_metadata: + assert ( + isinstance(d, dict) and "id" in d + ), "Unexpected REST API response, /curation/v1/collections/.../datasets/..." 
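+            # fold the per-dataset metadata (schema_version, canonical title) into
+            # the manifest record assembled above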
+ datasets[d["id"]].update( + { + "schema_version": d["schema_version"], + "dataset_title": null_to_empty_str(d["title"]), + } + ) + + # Remove any datasets that don't match our target schema version + obsolete_dataset_ids = [id for id in datasets if datasets[id]["schema_version"] not in CXG_SCHEMA_VERSION_IMPORT] + if len(obsolete_dataset_ids) > 0: + logging.warning(f"Dropping {len(obsolete_dataset_ids)} datasets due to unsupported schema version") + for id in obsolete_dataset_ids: + logging.info(f"Dropping dataset_id {id} due to schema version.") + datasets.pop(id) + + # Grab the asset URI for each dataset + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as tp: + dataset_assets = tp.map( + lambda d: ( + d["dataset_id"], + fetch_json( + f"{CXG_BASE_URI}curation/v1/collections/{d['collection_id']}/datasets/{d['dataset_id']}/assets" + ), + ), + datasets.values(), + ) + no_asset_found = [] + for dataset_id, assets in dataset_assets: + assert isinstance( + assets, list + ), "Unexpected REST API response, /curation/v1/collections/.../datasets/.../assets" + assets_h5ad = [a for a in assets if a["filetype"] == "H5AD"] + if len(assets_h5ad) == 0: + logging.error(f"Unable to find H5AD asset for dataset id {dataset_id} - ignoring this dataset") + no_asset_found.append(dataset_id) + else: + asset = assets_h5ad[0] + datasets[dataset_id].update( + { + "corpora_asset_h5ad_uri": asset["presigned_url"], + "asset_h5ad_filesize": asset["filesize"], + } + ) + + # drop any datasets where we could not find an asset + for id in no_asset_found: + datasets.pop(id, None) + + return [Dataset(**d) for d in datasets.values()] + + +def load_manifest(manifest_fp: Optional[io.TextIOBase] = None) -> list[Dataset]: + """ + Load dataset manifest from the file pointer if provided, else bootstrap + the load rom the CELLxGENE REST API. 
+ """ + if manifest_fp is not None: + datasets = load_manifest_from_fp(manifest_fp) + else: + datasets = load_manifest_from_CxG() + + logging.info(f"Loaded {len(datasets)} datasets.") + datasets = dedup_datasets(datasets) + return datasets diff --git a/cell_census_builder/mp.py b/cell_census_builder/mp.py new file mode 100644 index 000000000..dc59deafd --- /dev/null +++ b/cell_census_builder/mp.py @@ -0,0 +1,50 @@ +import argparse +import concurrent.futures +import logging +import os +from typing import Optional, cast + +import tiledbsoma as soma + +from .globals import set_tiledb_ctx + +if soma.get_storage_engine() == "tiledb": + import tiledb + + +def cpu_count() -> int: + """Sign, os.cpu_count() returns None if "undetermined" number of CPUs""" + cpu_count = os.cpu_count() + if os.cpu_count() is None: + return 1 + return cast(int, cpu_count) + + +def process_initializer(verbose: int = 0) -> None: + level = logging.DEBUG if verbose > 1 else logging.INFO if verbose == 1 else logging.WARNING + logging.basicConfig( + format="%(asctime)s %(process)-7s %(levelname)-8s %(message)s", + level=level, + datefmt="%Y-%m-%d %H:%M:%S", + ) + logging.captureWarnings(True) + + if soma.get_storage_engine() == "tiledb": + set_tiledb_ctx( + tiledb.Ctx( + { + "py.init_buffer_bytes": 512 * 1024**2, + "py.deduplicate": "true", + } + ) + ) + + +def create_process_pool_executor( + args: argparse.Namespace, max_workers: Optional[int] = None +) -> concurrent.futures.ProcessPoolExecutor: + return concurrent.futures.ProcessPoolExecutor( + max_workers=args.max_workers if max_workers is None else max_workers, + initializer=process_initializer, + initargs=(args.verbose,), + ) diff --git a/cell_census_builder/requirements.txt b/cell_census_builder/requirements.txt new file mode 100644 index 000000000..22da57df2 --- /dev/null +++ b/cell_census_builder/requirements.txt @@ -0,0 +1,16 @@ +pyarrow +pandas +anndata +numpy +tiledb +# NOTE: Until tiledbsoma is available on PyPi, you will need to build this dependency +# from source, per ./notebooks/README.md. 
+# tiledbsoma>=0.5.0
+scipy
+fsspec
+s3fs
+requests
+aiohttp
+Cython # required by owlready2
+wheel # required by owlready2
+owlready2
diff --git a/cell_census_builder/source_assets.py b/cell_census_builder/source_assets.py
new file mode 100644
index 000000000..244f4f8a8
--- /dev/null
+++ b/cell_census_builder/source_assets.py
@@ -0,0 +1,58 @@
+import argparse
+import logging
+import os
+import urllib.parse
+from typing import List, Tuple, cast
+
+import aiohttp
+import fsspec
+
+from .datasets import Dataset
+from .mp import cpu_count, create_process_pool_executor
+
+
+def stage_source_assets(datasets: List[Dataset], args: argparse.Namespace, assets_dir: str) -> None:
+    logging.info(f"Starting asset staging to {assets_dir}")
+    assert os.path.isdir(assets_dir)
+
+    # Fetch datasets largest first, to minimize overall download time
+    datasets = sorted(datasets, key=lambda d: d.asset_h5ad_filesize, reverse=True)
+
+    N = len(datasets)
+    # clamp the worker count to the range [8, 64] - the previous expression,
+    # max(min(8, cpu_count()), 64), always evaluated to 64
+    n_workers = min(max(8, cpu_count()), 64)
+    with create_process_pool_executor(args, n_workers) as pe:
+        paths = [
+            path
+            for path in pe.map(copy_file, ((n, dataset, assets_dir, N) for n, dataset in enumerate(datasets, start=1)))
+        ]
+
+    for i in range(len(datasets)):
+        datasets[i].dataset_h5ad_path = paths[i]
+
+
+def _copy_file(n: int, dataset: Dataset, asset_dir: str, N: int) -> str:
+    HTTP_GET_TIMEOUT_SEC = 2 * 60 * 60  # just a very big timeout
+    protocol = urllib.parse.urlparse(dataset.corpora_asset_h5ad_uri).scheme
+    fs = fsspec.filesystem(
+        protocol,
+        client_kwargs={"timeout": aiohttp.ClientTimeout(total=HTTP_GET_TIMEOUT_SEC, connect=None)},
+    )
+    dataset_file_name = f"{dataset.dataset_id}.h5ad"
+    dataset_path = f"{asset_dir}/{dataset_file_name}"
+
+    logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) to {dataset_path}")
+    fs.get_file(dataset.corpora_asset_h5ad_uri, dataset_path)
+    logging.info(f"Staging {dataset.dataset_id} ({n} of {N}) complete")
+    return dataset_file_name
+
+
+def copy_file(args: Tuple[int, Dataset, str, int]) -> str:
+    return _copy_file(*args)
+
+
+def cat_file(url: str) -> bytes:
+    with fsspec.open(url, compression="infer") as f:
+        content = cast(bytes, f.read())  # fsspec has no typing, yet
+
+    return content
diff --git a/cell_census_builder/summary_cell_counts.py b/cell_census_builder/summary_cell_counts.py
new file mode 100644
index 000000000..f33e1b51a
--- /dev/null
+++ b/cell_census_builder/summary_cell_counts.py
@@ -0,0 +1,128 @@
+import logging
+from typing import Sequence
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import tiledbsoma as soma
+
+from .globals import CENSUS_SUMMARY_CELL_COUNTS_COLUMNS, CENSUS_SUMMARY_CELL_COUNTS_NAME, TileDB_Ctx
+from .util import (
+    anndata_ordered_bool_issue_853_workaround,
+    pandas_dataframe_strings_to_ascii_issue_247_workaround,
+    uricat,
+)
+
+
+def create_census_summary_cell_counts(
+    info_collection: soma.Collection, per_experiment_summary: Sequence[pd.DataFrame]
+) -> None:
+    """
+    Save per-category counts as the census_summary_cell_counts SOMA dataframe
+    """
+    logging.info("Creating census_summary_cell_counts")
+    df = (
+        pd.concat(per_experiment_summary, ignore_index=True)
+        .drop(columns=["dataset_id"])
+        .groupby(by=["organism", "category", "ontology_term_id"], as_index=False, observed=True)
+        .agg({"unique_cell_count": "sum", "total_cell_count": "sum", "label": "first"})
+    )
+    df["soma_joinid"] = df.index.astype(np.int64)
+
+    # TODO: work-around for TileDB-SOMA#274. Remove when fixed.
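+    # (these coerce string columns to plain ASCII, and normalize numpy.bool_
+    # `ordered` flags on categorical columns to Python bool; see util.py)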
+ df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df) + df = anndata_ordered_bool_issue_853_workaround(df) + + # write to a SOMA dataframe + summary_counts_uri = uricat(info_collection.uri, CENSUS_SUMMARY_CELL_COUNTS_NAME) + summary_counts = soma.DataFrame(summary_counts_uri, ctx=TileDB_Ctx()) + summary_counts.create(pa.Schema.from_pandas(df, preserve_index=False), index_column_names=["soma_joinid"]) + for batch in pa.Table.from_pandas(df, preserve_index=False).to_batches(): + summary_counts.write(batch) + info_collection.set(CENSUS_SUMMARY_CELL_COUNTS_NAME, summary_counts, relative=True) + + +def init_summary_counts_accumulator() -> pd.DataFrame: + return pd.DataFrame( + data={ + "dataset_id": pd.Series([], dtype=str), + **{ + name: pd.Series([], dtype=arrow_type.to_pandas_dtype()) + for name, arrow_type in CENSUS_SUMMARY_CELL_COUNTS_COLUMNS.items() + }, + } + ) + + +def accumulate_summary_counts(current: pd.DataFrame, obs_df: pd.DataFrame) -> pd.DataFrame: + """ + Add summary counts to the census_summary_cell_counts dataframe + """ + assert "dataset_id" in obs_df + assert len(obs_df) > 0 + + CATEGORIES = [ + # term_id, label + ("cell_type_ontology_term_id", "cell_type"), + ("assay_ontology_term_id", "assay"), + ("tissue_ontology_term_id", "tissue"), + ("disease_ontology_term_id", "disease"), + ("self_reported_ethnicity_ontology_term_id", "self_reported_ethnicity"), + ("sex_ontology_term_id", "sex"), + ("tissue_general_ontology_term_id", "tissue_general"), + (None, "suspension_type"), + ] + + dfs = [] + for term_id, term_label in CATEGORIES: + cats = [] + columns = {} + assert term_id is not None or term_label is not None + if term_id is not None: + cats.append(term_id) + columns.update({term_id: "ontology_term_id"}) + if term_label is not None: + cats.append(term_label) + columns.update({term_label: "label"}) + assert len(cats) > 0 and len(columns) > 0 # i.e., one or both of term or label are specified + + df = obs_df[["dataset_id", "organism", *cats, "is_primary_data"]].rename(columns=columns) + if "label" not in df: + df["label"] = "na" + if "ontology_term_id" not in df: + df["ontology_term_id"] = "na" + + counts = ( + df.value_counts() + .to_frame(name="count") + .reset_index(level="is_primary_data") + .pivot_table( + values="count", + columns="is_primary_data", + index=["organism", "ontology_term_id", "label"], + fill_value=0, + ) + ) + if True not in counts: + counts[True] = 0 + if False not in counts: + counts[False] = 0 + + counts["category"] = term_label if term_label is not None else term_id + counts["unique_cell_count"] = counts[True] + counts["total_cell_count"] = counts[True] + counts[False] + counts = counts.drop(columns=[True, False]).reset_index() + dfs.append(counts) + + all = pd.DataFrame( + data={ + "dataset_id": [obs_df.iloc[0].dataset_id], + "organism": [obs_df.iloc[0].organism], + "ontology_term_id": ["na"], + "label": ["na"], + "category": ["all"], + "unique_cell_count": [dfs[0].unique_cell_count.sum()], + "total_cell_count": [dfs[0].total_cell_count.sum()], + } + ) + return pd.concat([current, all, *dfs], ignore_index=True) diff --git a/cell_census_builder/tissue_mapper.py b/cell_census_builder/tissue_mapper.py new file mode 100644 index 000000000..073e44a6e --- /dev/null +++ b/cell_census_builder/tissue_mapper.py @@ -0,0 +1,260 @@ +# type: ignore +# isort:skip_file +# flake8: noqa +""" +NOTE: This is a (literal) copy of 
+https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py + +This code should not be duplicated, but rather repackaged to be an importable sub-dependency. + +This code contains several places that do not pass the lint/static analysis CI for this pipeline, so the analysis is disabled in this prologue. +""" + +import owlready2 +from typing import List + + +class TissueMapper: + + # Name of anatomical structure, used to determine the set of ancestors for a given + # entity that we"re interested in. + ANATOMICAL_STRUCTURE_NAME = "UBERON_0000061" + + # List of high level tissues, ORDER MATTERS. If for a given tissue there are multiple high-level tissues associated + # then `self.get_high_level_tissue()` returns the one that appears first in th this list + HIGH_LEVEL_TISSUES = [ + "UBERON_0000178", # blood + "UBERON_0002048", # lung + "UBERON_0002106", # spleen + "UBERON_0002371", # bone marrow + "UBERON_0002107", # liver + "UBERON_0002113", # kidney + "UBERON_0000955", # brain + "UBERON_0002240", # spinal cord + "UBERON_0000310", # breast + "UBERON_0000948", # heart + "UBERON_0002097", # skin of body + "UBERON_0000970", # eye + "UBERON_0001264", # pancreas + "UBERON_0001043", # esophagus + "UBERON_0001155", # colon + "UBERON_0000059", # large intestine + "UBERON_0002108", # small intestine + "UBERON_0000160", # intestine + "UBERON_0000945", # stomach + "UBERON_0001836", # saliva + "UBERON_0001723", # tongue + "UBERON_0001013", # adipose tissue + "UBERON_0000473", # testis + "UBERON_0002367", # prostate gland + "UBERON_0000057", # urethra + "UBERON_0000056", # ureter + "UBERON_0003889", # fallopian tube + "UBERON_0000995", # uterus + "UBERON_0000992", # ovary + "UBERON_0002110", # gall bladder + "UBERON_0001255", # urinary bladder + "UBERON_0018707", # bladder organ + "UBERON_0000922", # embryo + "UBERON_0004023", # ganglionic eminence --> this a part of the embryo, remove in case generality is desired + "UBERON_0001987", # placenta + "UBERON_0007106", # chorionic villus + "UBERON_0002369", # adrenal gland + "UBERON_0002368", # endocrine gland + "UBERON_0002365", # exocrine gland + "UBERON_0000030", # lamina propria + "UBERON_0000029", # lymph node + "UBERON_0004536", # lymph vasculature + "UBERON_0001015", # musculature + "UBERON_0000004", # nose + "UBERON_0003688", # omentum + "UBERON_0000977", # pleura + "UBERON_0002370", # thymus + "UBERON_0002049", # vasculature + "UBERON_0009472", # axilla + "UBERON_0001087", # pleural fluid + "UBERON_0000344", # mucosa + "UBERON_0001434", # skeletal system + "UBERON_0002228", # rib + "UBERON_0003129", # skull + "UBERON_0004537", # blood vasculature + "UBERON_0002405", # immune system + "UBERON_0001009", # circulatory system + "UBERON_0001007", # digestive system + "UBERON_0001017", # central nervous system + "UBERON_0001008", # renal system + "UBERON_0000990", # reproductive system + "UBERON_0001004", # respiratory system + "UBERON_0000010", # peripheral nervous system + "UBERON_0001032", # sensory system + "UBERON_0002046", # thyroid gland + "UBERON_0004535", # cardiovascular system + "UBERON_0000949", # endocrine system + "UBERON_0002330", # exocrine system + "UBERON_0002390", # hematopoietic system + "UBERON_0000383", # musculature of body + "UBERON_0001465", # knee + "UBERON_0001016", # nervous system + "UBERON_0001348", # brown adipose tissue + "UBERON_0015143", # mesenteric fat pad + "UBERON_0000175", # pleural effusion + "UBERON_0001416", # skin of abdomen + 
"UBERON_0001868", # skin of chest + "UBERON_0001511", # skin of leg + "UBERON_0002190", # subcutaneous adipose tissue + "UBERON_0035328", # upper outer quadrant of breast + "UBERON_0000014", # zone of skin + ] + + # Terms to ignore when mapping + DENY_LIST = [ + "BFO_0000004", + "CARO_0000000", + "CARO_0030000", + "CARO_0000003", + "NCBITaxon_6072", + "Thing", + "UBERON_0000465", # material anatomical entity + "UBERON_0001062", # anatomical entity + ] + + def __init__(self, uberon_ontology: str = "http://purl.obolibrary.org/obo/uberon.owl"): + # TODO: use the pinned ontology at `single-cell-curation` + self._uberon = owlready2.get_ontology(uberon_ontology) + self._uberon.load() + self._cached_tissues = {} + self._cached_labels = {} + + def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str: + """ + Returns the associated high-level tissue ontology term ID from any other ID + Edge cases: + - If multiple high-level tissues exists for a given tissue, returns the one with higher priority (the first + appearance in list self.HIGH_LEVEL_TISSUES. + - If no high-level tissue is found, returns the same as input. + - If the input tissue is not found in the ontology, return the same as input. + - This could happen with something like "UBERON:0002048 (cell culture)" + """ + + tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False) + + if tissue_ontology_term_id in self._cached_tissues: + # If we have looked this up already + return self._cached_tissues[tissue_ontology_term_id] + + entity = self._get_entity_from_id(tissue_ontology_term_id) + + if not entity: + # If not found as an ontology ID return itself + result = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=True) + self._cached_tissues[tissue_ontology_term_id] = result + return result + + # List ancestors for this entity, including itself. Ignore any ancestors that + # are not descendents of UBERON_0000061 (anatomical structure). 
+ ancestors = [entity.name] + branch_ancestors = [] + for is_a in entity.is_a: + branch_ancestors = self._list_ancestors(is_a, branch_ancestors) + + # Include this branch of ancestors is under anatomical structure + if self.ANATOMICAL_STRUCTURE_NAME in branch_ancestors: + ancestors.extend(branch_ancestors) + + # Check if there's at least one top-level entity in the list of ancestors + # for this entity + selected_tissue = tissue_ontology_term_id + for high_level_tissue in self.HIGH_LEVEL_TISSUES: + if high_level_tissue in ancestors: + selected_tissue = high_level_tissue + break + + result = self.reformat_ontology_term_id(selected_tissue, to_writable=True) + self._cached_tissues[tissue_ontology_term_id] = result + return result + + def get_label_from_writable_id(self, ontology_term_id: str): + """ + Returns the label from and ontology term id that is in writable form + Example: "UBERON:0002048" returns "lung" + Example: "UBERON_0002048" raises ValueError because the ID is not in writable form + """ + + if ontology_term_id in self._cached_labels: + return self._cached_labels[ontology_term_id] + + entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False)) + if entity: + result = entity.label[0] + else: + result = ontology_term_id + + self._cached_labels[ontology_term_id] = result + return result + + @staticmethod + def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True): + """ + Converts ontology term id string between two formats: + - `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048" + - `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048" + """ + + if to_writable: + if ontology_term_id.count("_") != 1: + raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one '_'") + return ontology_term_id.replace("_", ":") + else: + if ontology_term_id.count(":") != 1: + raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'") + return ontology_term_id.replace(":", "_") + + def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: List[str] = []) -> List[str]: + """ + Recursive function that given an entity of an ontology, it traverses the ontology and returns + a list of all ancestors associated with the entity. + """ + + if self._is_restriction(entity): + # Entity is a restriction, check for part_of relationship + + prop = entity.property.name + if prop != "BFO_0000050": + # BFO_0000050 is "part of" + return ancestors + ancestors.append(entity.value.name.replace("obo.", "")) + + # Check for ancestors of restriction + self._list_ancestors(entity.value, ancestors) + return ancestors + + elif self._is_entity(entity) and not self._is_and_object(entity): + # Entity is a superclass, check for is_a relationships + + if entity.name in self.DENY_LIST: + return ancestors + ancestors.append(entity.name) + + # Check for ancestors of superclass + for super_entity in entity.is_a: + self._list_ancestors(super_entity, ancestors) + return ancestors + + def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass: + """ + Given a readable ontology term id (e.g. 
"UBERON_0002048"), it returns the associated ontology entity + """ + # TODO: use the pinned ontology at `single-cell-curation` + return self._uberon.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}") + + @staticmethod + def _is_restriction(entity: owlready2.entity.ThingClass) -> bool: + return hasattr(entity, "value") + + @staticmethod + def _is_entity(entity: owlready2.entity.ThingClass) -> bool: + return hasattr(entity, "name") + + @staticmethod + def _is_and_object(entity: owlready2.entity.ThingClass) -> bool: + return hasattr(entity, "Classes") diff --git a/cell_census_builder/tools/aws/mount_instance_storage.sh b/cell_census_builder/tools/aws/mount_instance_storage.sh new file mode 100644 index 000000000..e9c444ffa --- /dev/null +++ b/cell_census_builder/tools/aws/mount_instance_storage.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +# This automates mounting all instance (ephemeral) storage onto a file +# system. If a single device is found, it creates an ext4 file system. +# If multiple are found, it creates a RAID0 group, and an ext4 file +# system on top of it. +# +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/add-instance-store-volumes.html +# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/raid-config.html +# + +# exit immediately when a command fails +set -e +# treat unset variables as an error and exit immediately +set -u +# echo each line of the script to stdout so we can see what is happening +# to turn off echo do 'set +o xtrace' +set -o xtrace + + +DEVICE_PREFIX="nvme" +MOUNTPOINT="/mnt/scratch" +RAID_VOLUME="/dev/md0" +VOLUME_LABEL="scratch_volume" + + +# Must be run as privledged user +if [[ $(id -u) != 0 ]]; then + echo "ERROR: not root. You must run using sudo. Exiting with no action taken." + exit +fi + +# Test for a conflict on the mount point +if grep -qs ' ${MOUNTPOINT} ' /proc/mounts; then + echo "ERROR: ${MOUNTPOINT} aleady in use. Exiting with no action taken." + exit +fi + + +# Detect all blcok devices that are disks, and do not have +# partitions or other holder devices (eg, part of raid group, etc) +function detect_devices { + PY_CMD=' +import sys, json +device_prefix = sys.argv[1] +bdevs = [ + dev for dev in json.load(sys.stdin)["blockdevices"] + if dev["type"] == "disk" and "children" not in dev and dev["name"].startswith(device_prefix) +] +for d in bdevs: + name = d["name"] + print(f"/dev/{name}") +' + lsblk --json --output NAME,TYPE,MOUNTPOINT | python3 -c "${PY_CMD}" "$1" +} + +function create_volume { + devices_count=$(wc -w <<< $@) + if [[ ${devices_count} == 0 ]]; then + echo "No devices found, no volume created." + exit 1 + elif [[ ${devices_count} == 1 ]]; then + echo "Found single device, creating volume" + mkfs.ext4 -L ${VOLUME_LABEL} $@ + else + echo "Found ${devices_count} devices, creating RAID0 volume" + mdadm --create --verbose ${RAID_VOLUME} --level=0 --name=${VOLUME_LABEL} --raid-devices=${devices_count} $@ + mkfs.ext4 -L ${VOLUME_LABEL} ${RAID_VOLUME} + fi +} + +function mount_volume { + mkdir -p ${MOUNTPOINT} + mount LABEL=${VOLUME_LABEL} ${MOUNTPOINT} + chmod 777 ${MOUNTPOINT} +} + +create_volume $(detect_devices ${DEVICE_PREFIX}) +mount_volume +echo "Done. Mounted on ${MOUNTPOINT}." 
diff --git a/cell_census_builder/tools/aws/swapon_instance_storage.sh b/cell_census_builder/tools/aws/swapon_instance_storage.sh
new file mode 100644
index 000000000..bd9a086f0
--- /dev/null
+++ b/cell_census_builder/tools/aws/swapon_instance_storage.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+# This automates adding all instance (ephemeral) storage as swap
+#
+# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html
+
+# exit immediately when a command fails
+set -e
+# treat unset variables as an error and exit immediately
+set -u
+# echo each line of the script to stdout so we can see what is happening
+# to turn off echo do 'set +o xtrace'
+set -o xtrace
+
+DEVICE_PREFIX="nvme"
+
+# Must be run as privileged user
+if [[ $(id -u) != 0 ]]; then
+    echo "ERROR: not root. You must run using sudo. Exiting with no action taken."
+    exit 1
+fi
+
+# Detect all block devices that are disks, and do not have
+# partitions or other holder devices (eg, part of raid group, etc)
+function detect_devices {
+    PY_CMD='
+import sys, json
+device_prefix = sys.argv[1]
+bdevs = [
+    dev for dev in json.load(sys.stdin)["blockdevices"]
+    if dev["type"] == "disk" and "children" not in dev and dev["name"].startswith(device_prefix)
+]
+for d in bdevs:
+    name = d["name"]
+    print(f"/dev/{name}")
+'
+    lsblk --json --output NAME,TYPE,MOUNTPOINT | python3 -c "${PY_CMD}" "$1"
+}
+
+for bdev in $(detect_devices ${DEVICE_PREFIX}); do
+    echo "Adding ${bdev}"
+    mkswap ${bdev}
+    swapon -v ${bdev}
+done
+
+echo "Done, swapping on devices:"
+swapon -s
diff --git a/cell_census_builder/util.py b/cell_census_builder/util.py
new file mode 100644
index 000000000..678b2a0e8
--- /dev/null
+++ b/cell_census_builder/util.py
@@ -0,0 +1,128 @@
+import urllib.parse
+from typing import Any, Iterator, Union
+from warnings import warn
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import requests
+from scipy import sparse
+
+
+def array_chunker(arr: Union[npt.NDArray[Any], sparse.spmatrix]) -> Iterator[sparse.coo_matrix]:
+    """
+    Yield the array as a sequence of chunks, each a coo_matrix.
+    """
+    nnz_chunk_size = 256 * 1024**2  # goal (~2.4GiB for a 32-bit COO)
+
+    if isinstance(arr, sparse.csr_matrix) or isinstance(arr, sparse.csr_array):
+        avg_nnz_per_row = arr.nnz // arr.shape[0]
+        row_chunk_size = max(1, round(nnz_chunk_size / avg_nnz_per_row))
+        for row_idx in range(0, arr.shape[0], row_chunk_size):
+            slc = arr[row_idx : row_idx + row_chunk_size, :].tocoo()
+            slc.resize(arr.shape)
+            slc.row += row_idx
+            yield slc
+        return
+
+    if isinstance(arr, sparse.csc_matrix) or isinstance(arr, sparse.csc_array):
+        avg_nnz_per_col = arr.nnz // arr.shape[1]
+        col_chunk_size = max(1, round(nnz_chunk_size / avg_nnz_per_col))
+        for col_idx in range(0, arr.shape[1], col_chunk_size):
+            slc = arr[:, col_idx : col_idx + col_chunk_size].tocoo()
+            slc.resize(arr.shape)
+            slc.col += col_idx
+            yield slc
+        return
+
+    if isinstance(arr, np.ndarray):
+        row_chunk_size = max(1, nnz_chunk_size // arr.shape[1])
+        for row_idx in range(0, arr.shape[0], row_chunk_size):
+            slc = sparse.coo_matrix(arr[row_idx : row_idx + row_chunk_size, :])
+            slc.resize(arr.shape)
+            slc.row += row_idx
+            yield slc
+        return
+
+    raise NotImplementedError("array_chunker: unsupported array type")
+
+
+def uricat(container_uri: str, *paths: str) -> str:
+    """
+    Concat one or more paths, separated with '/'
+
+    Similar to urllib.parse.urljoin except it takes an iterator, and
+    assumes the container_uri is a 'directory'/container, ie, ends in '/'.
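+
+    e.g., uricat("/tmp/census/", "soma", "obs") -> "/tmp/census/soma/obs"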
+ """ + + uri = container_uri + for p in paths: + uri = uri if uri.endswith("/") else uri + "/" + uri = urllib.parse.urljoin(uri, p) + return uri + + +def fetch_json(url: str) -> object: + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def is_positive_integral(X: Union[npt.NDArray[np.floating[Any]], sparse.spmatrix]) -> bool: + """ + Return true if the matrix/array contains only positive integral values, + False otherwise. + """ + data = X if isinstance(X, np.ndarray) else X.data + + if np.signbit(data).any(): + return False + elif np.any(~np.equal(np.mod(data, 1), 0)): + return False + else: + return True + + +def pandas_dataframe_strings_to_ascii_issue_247_workaround(df: pd.DataFrame) -> pd.DataFrame: + """ + TileDB-SOMA _temporarily_ supports only ASCII in strings. + This code will convert all strings in a dataframe to ascii. + Remove this code when TileDB-SOMA#274 is resolved. + """ + import unicodedata + + warn("Converting dataframe strings to ASCII as temporary work-around for TileDB-SOMA#274.") + for k in df: + if df[k].dtype == object: + df[k] = df[k].map(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode()) + + return df + + +def anndata_ordered_bool_issue_853_workaround(df: pd.DataFrame) -> pd.DataFrame: + # """ + # TileDB-SOMA does not support creating dataframe with categorical / dictionary + # column types. + # """ + # copied = False + # for k in df.keys(): + # if pd.api.types.is_categorical_dtype(df[k]): + # if not copied: + # df = df.copy() + # copied = True + + # df[k] = df[k].astype(df[k].cat.categories.dtype) + + # AnnData has a bug (https://github.com/scverse/anndata/issues/853) which will + # cause Pandas CategoricalDtype `ordered` to be a numpy.bool_, rather than a bool. + # This causes Arrow to blow up. 
+ copied = False + for k in df.keys(): + if pd.api.types.is_categorical_dtype(df[k]) and type(df[k].cat.ordered) == np.bool_: + if not copied: + df = df.copy() + copied = True + + df[k] = df[k].cat.set_categories(df[k].cat.categories, ordered=bool(df[k].cat.ordered)) + + return df diff --git a/cell_census_builder/validate.py b/cell_census_builder/validate.py new file mode 100644 index 000000000..d1d18ad0a --- /dev/null +++ b/cell_census_builder/validate.py @@ -0,0 +1,374 @@ +import argparse +import concurrent.futures +import dataclasses +import logging +import os.path +import pathlib +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Tuple, Union, cast + +import numpy as np +import numpy.typing as npt +import pyarrow as pa +import tiledbsoma as soma +from scipy import sparse + +from .anndata import make_anndata_cell_filter, open_anndata +from .datasets import Dataset +from .experiment_builder import ExperimentBuilder +from .globals import ( + CENSUS_DATASETS_COLUMNS, + CENSUS_DATASETS_NAME, + CENSUS_OBS_TERM_COLUMNS, + CENSUS_SCHEMA_VERSION, + CENSUS_SUMMARY_CELL_COUNTS_COLUMNS, + CENSUS_SUMMARY_CELL_COUNTS_NAME, + CENSUS_SUMMARY_NAME, + CENSUS_VAR_TERM_COLUMNS, + CXG_OBS_TERM_COLUMNS, + CXG_SCHEMA_VERSION, + X_LAYERS, + TileDB_Ctx, +) +from .mp import create_process_pool_executor +from .util import uricat + + +@dataclass +class EbInfo: + """Class used to collect information about axis (for validation code)""" + + n_obs: int = 0 + vars: set[str] = dataclasses.field(default_factory=set) + dataset_ids: set[str] = dataclasses.field(default_factory=set) + + def update(self: "EbInfo", b: "EbInfo") -> "EbInfo": + self.n_obs += b.n_obs + self.vars |= b.vars + self.dataset_ids |= b.dataset_ids + return self + + +def validate_all_soma_objects_exist(soma_path: str, experiment_builders: List[ExperimentBuilder]) -> bool: + """ + Validate all objects present and contain expected metadata. 
+ + soma_path + +-- census_info + | +-- summary: soma.DataFrame + | +-- datasets: soma.DataFrame + | +-- summary_cell_counts: soma.DataFrame + +-- census_data + | +-- homo_sapiens: soma.Experiment + | +-- mus_musculus: soma.Experiment + """ + census = soma.Collection(soma_path, ctx=TileDB_Ctx()) + assert census.exists() and census.soma_type == "SOMACollection" + assert "cxg_schema_version" in census.metadata and census.metadata["cxg_schema_version"] == CXG_SCHEMA_VERSION + assert ( + "census_schema_version" in census.metadata and census.metadata["census_schema_version"] == CENSUS_SCHEMA_VERSION + ) + assert "created_on" in census.metadata and datetime.fromisoformat(census.metadata["created_on"]) + + for name in ["census_info", "census_data"]: + assert name in census + assert census[name].soma_type == "SOMACollection" + assert census[name].exists() + + census_info = census["census_info"] + for name in [CENSUS_DATASETS_NAME, CENSUS_SUMMARY_NAME, CENSUS_SUMMARY_CELL_COUNTS_NAME]: + assert name in census_info, f"`{name}` missing from census_info" + assert census_info[name].soma_type == "SOMADataFrame" + assert census_info[name].exists() + + assert sorted(census_info[CENSUS_DATASETS_NAME].keys()) == sorted(CENSUS_DATASETS_COLUMNS + ["soma_joinid"]) + assert sorted(census_info[CENSUS_SUMMARY_CELL_COUNTS_NAME].keys()) == sorted( + list(CENSUS_SUMMARY_CELL_COUNTS_COLUMNS) + ["soma_joinid"] + ) + assert sorted(census_info[CENSUS_SUMMARY_NAME].keys()) == sorted(["label", "value", "soma_joinid"]) + + # there should be an experiment for each builder + census_data = census["census_data"] + for eb in experiment_builders: + assert ( + eb.name in census_data + and census_data[eb.name].exists() + and census_data[eb.name].soma_type == "SOMAExperiment" + ) + + e = census_data[eb.name] + assert "obs" in e and e.obs.exists() and e.obs.soma_type == "SOMADataFrame" + assert "ms" in e and e.ms.exists() and e.ms.soma_type == "SOMACollection" + + # there should be a single measurement called 'RNA' + assert "RNA" in e.ms and e.ms["RNA"].exists() and e.ms["RNA"].soma_type == "SOMAMeasurement" + + # The measurement should contain all X layers where n_obs > 0 (existence checked elsewhere) + rna = e.ms["RNA"] + assert "var" in rna and rna["var"].exists() and rna["var"].soma_type == "SOMADataFrame" + assert "X" in rna and rna["X"].exists() and rna["X"].soma_type == "SOMACollection" + for lyr in X_LAYERS: + # layers only exist if there are cells in the measurement + if lyr in rna.X: + assert rna.X[lyr].exists() and rna.X[lyr].soma_type == "SOMASparseNdArray" + + # and a presence matrix + assert "varp" in rna and rna["varp"].exists() and rna["varp"].soma_type == "SOMACollection" + # dataset presence only exists if there are cells in the measurement + if "dataset_presence_matrix" in rna.varp: + assert rna.varp["dataset_presence_matrix"].exists() + assert rna.varp["dataset_presence_matrix"].soma_type == "SOMASparseNdArray" + + return True + + +def _validate_axis_dataframes(args: Tuple[str, str, Dataset, List[ExperimentBuilder]]) -> Dict[str, EbInfo]: + assets_path, soma_path, dataset, experiment_builders = args + census = soma.Collection(soma_path, ctx=TileDB_Ctx()) + census_data = census["census_data"] + dataset_id = dataset.dataset_id + _, unfiltered_ad = next(open_anndata(assets_path, [dataset], backed="r")) + eb_info: Dict[str, EbInfo] = {} + for eb in experiment_builders: + eb_info[eb.name] = EbInfo() + anndata_cell_filter = make_anndata_cell_filter(eb.anndata_cell_filter_spec) + se = census_data[eb.name] + ad = 
anndata_cell_filter(unfiltered_ad, retain_X=False)
+        dataset_obs = (
+            se.obs.read_as_pandas_all(
+                column_names=list(CENSUS_OBS_TERM_COLUMNS),
+                value_filter=f"dataset_id == '{dataset_id}'",
+            )
+            .drop(columns=["dataset_id", "tissue_general", "tissue_general_ontology_term_id"])
+            .sort_values(by="soma_joinid")
+            .drop(columns=["soma_joinid"])
+            .reset_index(drop=True)
+        )
+
+        assert len(dataset_obs) == len(ad.obs), f"{dataset.dataset_id}/{eb.name} obs length mismatch"
+        if ad.n_obs > 0:
+            eb_info[eb.name].n_obs += ad.n_obs
+            eb_info[eb.name].dataset_ids.add(dataset_id)
+            eb_info[eb.name].vars |= set(ad.var.index.array)
+            ad_obs = ad.obs[list(CXG_OBS_TERM_COLUMNS)].reset_index(drop=True)
+            assert (dataset_obs == ad_obs).all().all(), f"{dataset.dataset_id}/{eb.name} obs content mismatch"
+
+    return eb_info
+
+
+def validate_axis_dataframes(
+    assets_path: str,
+    soma_path: str,
+    datasets: List[Dataset],
+    experiment_builders: List[ExperimentBuilder],
+    args: argparse.Namespace,
+) -> bool:
+    """
+    Validate axis dataframes: schema, shape, contents
+
+    Raises on error. Returns True on success.
+    """
+    logging.debug("validate_axis_dataframes")
+    census = soma.Collection(soma_path, ctx=TileDB_Ctx())
+    census_data = census["census_data"]
+
+    # check schema
+    expected_obs_columns = CENSUS_OBS_TERM_COLUMNS
+    expected_var_columns = CENSUS_VAR_TERM_COLUMNS
+    for eb in experiment_builders:
+        obs = census_data[eb.name].obs
+        var = census_data[eb.name].ms["RNA"].var
+        assert sorted(obs.keys()) == sorted(expected_obs_columns.keys())
+        assert sorted(var.keys()) == sorted(expected_var_columns.keys())
+        for field in obs.schema:
+            assert field.name in expected_obs_columns
+            assert field.type == expected_obs_columns[field.name], f"Unexpected type in {field.name}: {field.type}"
+        for field in var.schema:
+            assert field.name in expected_var_columns
+            assert field.type == expected_var_columns[field.name], f"Unexpected type in {field.name}: {field.type}"
+
+    # check shapes & perform weak test of contents
+    eb_info = {eb.name: EbInfo() for eb in experiment_builders}
+    if args.multi_process:
+        with create_process_pool_executor(args) as ppe:
+            futures = [
+                ppe.submit(_validate_axis_dataframes, (assets_path, soma_path, dataset, experiment_builders))
+                for dataset in datasets
+            ]
+            for n, future in enumerate(concurrent.futures.as_completed(futures), start=1):
+                res = future.result()
+                for eb_name, ebi in res.items():
+                    eb_info[eb_name].update(ebi)
+                logging.info(f"validate_axis {n} of {len(datasets)} complete.")
+    else:
+        for n, dataset in enumerate(datasets, start=1):
+            for eb_name, ebi in _validate_axis_dataframes(
+                (assets_path, soma_path, dataset, experiment_builders)
+            ).items():
+                eb_info[eb_name].update(ebi)
+            logging.info(f"validate_axis {n} of {len(datasets)} complete.")
+
+    for eb in experiment_builders:
+        se = census_data[eb.name]
+        n_vars = len(eb_info[eb.name].vars)
+
+        census_obs_df = se.obs.read_as_pandas_all(column_names=["soma_joinid", "dataset_id"])
+        assert eb_info[eb.name].n_obs == len(census_obs_df)
+        assert (len(census_obs_df) == 0) or (census_obs_df.soma_joinid.max() + 1 == eb_info[eb.name].n_obs)
+        assert eb_info[eb.name].dataset_ids == set(census_obs_df.dataset_id.unique())
+
+        census_var_df = se.ms["RNA"].var.read_as_pandas_all(column_names=["feature_id", "soma_joinid"])
+        assert n_vars == len(census_var_df)
+        assert eb_info[eb.name].vars == set(census_var_df.feature_id.array)
+        assert (len(census_var_df) == 0) or (census_var_df.soma_joinid.max() + 1 == n_vars)
+
+    return True
+
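For orientation, a minimal illustration of the EbInfo merge used above, as it happens when per-dataset results are accumulated (including across worker processes in the multi_process path). EbInfo is the dataclass defined at the top of this file; the values are made up:

a = EbInfo(n_obs=100, vars={"ENSG00000000003"}, dataset_ids={"dataset-1"})
b = EbInfo(n_obs=50, vars={"ENSG00000000003", "ENSG00000000005"}, dataset_ids={"dataset-2"})
a.update(b)
assert a.n_obs == 150                                      # obs counts sum
assert a.vars == {"ENSG00000000003", "ENSG00000000005"}    # var sets union
assert a.dataset_ids == {"dataset-1", "dataset-2"}         # dataset ids union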
+def _validate_X_layers_contents(args: Tuple[str, str, Dataset, List[ExperimentBuilder]]) -> bool:
+    """
+    Validate that a single dataset is correctly represented in the census.
+    Intended to be dispatched from validate_X_layers.
+
+    Currently implements a weak test: that nnz is correct.
+    """
+    assets_path, soma_path, dataset, experiment_builders = args
+    census = soma.Collection(soma_path, ctx=TileDB_Ctx())
+    census_data = census["census_data"]
+    _, unfiltered_ad = next(open_anndata(assets_path, [dataset]))
+    for eb in experiment_builders:
+        se = census_data[eb.name]
+        anndata_cell_filter = make_anndata_cell_filter(eb.anndata_cell_filter_spec)
+        ad = anndata_cell_filter(unfiltered_ad, retain_X=True)
+
+        soma_joinids: npt.NDArray[np.int64] = se.obs.read_as_pandas_all(
+            column_names=["soma_joinid", "dataset_id"], value_filter=f"dataset_id == '{dataset.dataset_id}'"
+        ).soma_joinid.to_numpy()
+
+        raw_nnz = 0
+        if len(soma_joinids) > 0:
+            assert "raw" in se.ms["RNA"].X and se.ms["RNA"].X["raw"].exists()
+
+            def count_elements(arr: soma.SparseNdArray, join_ids: npt.NDArray[np.int64]) -> int:
+                # TODO XXX: Work-around for regression TileDB-SOMA#473
+                # return sum(t.non_zero_length for t in arr.read_sparse_tensor((join_ids, slice(None))))
+                return sum(t.non_zero_length for t in arr.read_sparse_tensor((pa.array(join_ids), slice(None))))
+
+            raw_nnz = count_elements(se.ms["RNA"].X["raw"], soma_joinids)
+
+        def nnz(arr: Union[sparse.spmatrix, npt.NDArray[Any]]) -> int:
+            if isinstance(arr, (sparse.spmatrix, sparse.coo_array, sparse.csr_array, sparse.csc_array)):
+                return cast(int, arr.nnz)
+            return np.count_nonzero(arr)
+
+        if ad.raw is None:
+            assert raw_nnz == nnz(ad.X), f"{eb.name}:{dataset.dataset_id} 'raw' nnz mismatch {raw_nnz} vs {nnz(ad.X)}"
+        else:
+            assert raw_nnz == nnz(
+                ad.raw.X
+            ), f"{eb.name}:{dataset.dataset_id} 'raw' nnz mismatch {raw_nnz} vs {nnz(ad.raw.X)}"
+
+    return True
+
+
+def validate_X_layers(
+    assets_path: str,
+    soma_path: str,
+    datasets: List[Dataset],
+    experiment_builders: List[ExperimentBuilder],
+    args: argparse.Namespace,
+) -> bool:
+    """
+    Validate all X layers: schema, shape, contents
+
+    Raises on error. Returns True on success.
+ """ + logging.debug("validate_X_layers") + census = soma.Collection(soma_path, ctx=TileDB_Ctx()) + census_data = census["census_data"] + + for eb in experiment_builders: + se = census_data[eb.name] + assert se.ms["RNA"].X.exists() + + census_obs_df = se.obs.read_as_pandas_all(column_names=["soma_joinid"]) + n_obs = len(census_obs_df) + census_var_df = se.ms["RNA"].var.read_as_pandas_all(column_names=["feature_id", "soma_joinid"]) + n_vars = len(census_var_df) + + if n_obs > 0: + for lyr in X_LAYERS: + assert se.ms["RNA"].X[lyr].exists() + X = se.ms["RNA"].X[lyr] + assert X.schema.field("soma_dim_0").type == pa.int64() + assert X.schema.field("soma_dim_1").type == pa.int64() + assert X.schema.field("soma_data").type == pa.float32() + assert X.shape == (n_obs, n_vars) + + if args.multi_process: + with create_process_pool_executor(args) as ppe: + futures = [ + ppe.submit(_validate_X_layers_contents, (assets_path, soma_path, dataset, experiment_builders)) + for dataset in datasets + ] + for n, future in enumerate(concurrent.futures.as_completed(futures), start=1): + assert future.result() + logging.info(f"validate_X {n} of {len(datasets)} complete.") + else: + for n, vld in enumerate( + ( + _validate_X_layers_contents((assets_path, soma_path, dataset, experiment_builders)) + for dataset in datasets + ), + start=1, + ): + logging.info(f"validate_X {n} of {len(datasets)} complete.") + assert vld + + return True + + +def load_datasets_from_census(assets_path: str, soma_path: str) -> List[Dataset]: + # Datasets are pulled from the census datasets manifest, validating the SOMA + # census against the snapshot assets. + df = soma.Collection(soma_path)["census_info"][CENSUS_DATASETS_NAME].read_as_pandas_all() + df.drop(columns=["soma_joinid"], inplace=True) + df["corpora_asset_h5ad_uri"] = df.dataset_h5ad_path.map(lambda p: uricat(assets_path, p)) + datasets = Dataset.from_dataframe(df) + return datasets + + +def validate_manifest_contents(assets_path: str, datasets: List[Dataset]) -> bool: + """Confirm contents of manifest are correct.""" + for d in datasets: + p = pathlib.Path(uricat(assets_path, d.dataset_h5ad_path)) + assert p.exists() and p.is_file(), f"{d.dataset_h5ad_path} is missing from the census" + assert str(p).endswith(".h5ad"), "Expected only H5AD assets" + + return True + + +def validate(args: argparse.Namespace, experiment_builders: List[ExperimentBuilder]) -> bool: + """ + Validate that the "census" matches the datasets and experiment builder spec. + + Will raise if validation fails. Returns True on success. 
+ """ + logging.info("Validation start") + + base_path = uricat(args.uri, args.build_tag) + soma_path = uricat(base_path, "soma") + assets_path = uricat(base_path, "h5ads") + + assert os.path.exists(soma_path) and os.path.exists(assets_path) + + assert validate_all_soma_objects_exist(soma_path, experiment_builders) + + datasets = load_datasets_from_census(assets_path, soma_path) + assert validate_manifest_contents(assets_path, datasets) + + assert validate_axis_dataframes(assets_path, soma_path, datasets, experiment_builders, args) + assert validate_X_layers(assets_path, soma_path, datasets, experiment_builders, args) + logging.info("Validation success") + return True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..5bc9af63d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,14 @@ +[tool.black] +line-length = 120 +target_version = ['py39'] + +[tool.isort] +profile="black" +line_length = 120 + +[tool.mypy] +show_error_codes = true +ignore_missing_imports = true +warn_unreachable = true +strict = true +plugins = "numpy.typing.mypy_plugin"