initial commit of migrated source (#2)

* initial commit of migrated source * add notebooks * fix readme * pr review changes * add missing semver
chanzuckerberg · Nov 30, 2022 · aa2277e · aa2277e
1 parent c21177c
commit aa2277e
Show file tree

Hide file tree

Showing 51 changed files with 7,282 additions and 1 deletion.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,13 @@
+[flake8]
+max-line-length = 120
+extend-ignore =
+    # See https://github.com/PyCQA/pycodestyle/issues/373
+    E203,
+    E402
+exclude =
+    .git/
+    venv/
+    tmp/
+    .ipynb_checkpoints/
+    __pycache__
+
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
@@ -0,0 +1,35 @@
+name: Python Linting
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: .
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      - run: pip install -U pip
+      - name: Python Black
+        run: |
+          python -m pip install black black[jupyter]
+          python -m black --check --diff .
+      - name: Python isort
+        run: |
+          python -m pip install isort
+          python -m isort --check --diff .
+      - name: Python style with flake8[bugbear]
+        run: |
+          python -m pip install flake8-bugbear
+          python -m flake8 .
+      - name: Check type annotations mypy
+        run: |
+          python -m pip install mypy==0.982 types-setuptools types-requests numpy
+          python -m mypy .
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# common user tmp directories
+tmp
+temp
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
 # CELLxGENE Cell Census
 
-This repository contains documentation and example code related to the Chan Zuckerberg CELLxGENE Cell Census.
+**Status**: Unstable, under rapid development
+
+This repository contains documentation and example code related to the Chan Zuckerberg CELLxGENE Cell Census, and a client (API) package to simplify accessing the Cell Census data.
 
 The CZ Cell Census is an aggregation of all public single cell data available in [CELLxGENE Discover](https://cellxgene.cziscience.com/), published in API-accessible formats, including the [SOMA API](https://github.com/single-cell-data/).
 

diff --git a/api/python/cell_census/LICENSE b/api/python/cell_census/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Chan Zuckerberg Initiative
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/api/python/cell_census/REAMDE.md b/api/python/cell_census/REAMDE.md
@@ -0,0 +1,9 @@
+The `cell_census` package provides an API to facilitate use of the CZI Science Cell Census.
+
+**Status**: Unstable, under rapid development
+
+For more information, see the [cell_census repo](https://github.com/chanzuckerberg/cell-census/)### For More Help
+
+For more help, please file a issue on the repo, or contact us at <[email protected]>
+
+If you believe you have found a security issue, we would appreciate notification. Please send email to <[email protected]>.
diff --git a/api/python/cell_census/setup.cfg b/api/python/cell_census/setup.cfg
@@ -0,0 +1,27 @@
+[metadata]
+name = cell_census
+version = attr: cell_census.__version__
+author = Chan Zuckerberg Initiative
+author_email = [email protected]
+description = API to simplify use of the CZI Science CELLxGENE Cell Census
+long_description = file: README.md LICENSE
+license = MIT
+url = https://github.com/chanzuckerberg/cell-census
+
+[options]
+python_requires = >= 3.8
+install_requires =
+    numba
+    numpy
+    requests
+    tiledb
+    tiledbsoma
+    typing_extensions
+    s3fs
+    scikit-misc
+package_dir=
+    =src
+packages=find:
+
+[options.packages.find]
+where=src
diff --git a/api/python/cell_census/setup.py b/api/python/cell_census/setup.py
@@ -0,0 +1,4 @@
+from setuptools import setup
+
+if __name__ == "__main__":
+    setup()
diff --git a/api/python/cell_census/src/cell_census/__init__.py b/api/python/cell_census/src/cell_census/__init__.py
@@ -0,0 +1,14 @@
+from .get_anndata import get_anndata
+from .open import download_source_h5ad, get_source_h5ad_uri, open_soma
+from .release_directory import get_directory, get_release_description
+
+__version__ = "0.0.1-dev0"
+
+__all__ = [
+    "download_source_h5ad",
+    "get_anndata",
+    "get_directory",
+    "get_source_h5ad_uri",
+    "get_release_description",
+    "open_soma",
+]
diff --git a/api/python/cell_census/src/cell_census/compute/__init__.py b/api/python/cell_census/src/cell_census/compute/__init__.py
@@ -0,0 +1,7 @@
+from .highly_variable_genes import highly_variable_genes
+from .meanvar import OnlineMatrixMeanVariance
+
+__all__ = [
+    "highly_variable_genes",
+    "OnlineMatrixMeanVariance",
+]
diff --git a/api/python/cell_census/src/cell_census/compute/highly_variable_genes.py b/api/python/cell_census/src/cell_census/compute/highly_variable_genes.py
@@ -0,0 +1,94 @@
+import numpy as np
+import pandas as pd
+
+from ..experiment_query import ExperimentQuery
+from .meanvar import OnlineMatrixMeanVariance
+
+
+def highly_variable_genes(query: ExperimentQuery, n_top_genes: int = 10) -> pd.DataFrame:
+    """
+    Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy
+    """
+    use_prefetch = True
+
+    try:
+        import skmisc.loess
+    except ImportError:
+        raise ImportError("Please install skmisc package via `pip install --user scikit-misc")
+
+    indexer = query.get_indexer()
+    mvn = OnlineMatrixMeanVariance(query.n_obs, query.n_vars)
+    for arrow_tbl in query.X("raw", prefetch=use_prefetch):
+        var_dim = indexer.var_index(arrow_tbl["soma_dim_1"])
+        data = arrow_tbl["soma_data"].to_numpy()
+        mvn.update(var_dim, data)
+
+    u, v = mvn.finalize()
+    var_df = pd.DataFrame(
+        index=pd.Index(data=query.var_joinids(), name="soma_joinid"),
+        data={
+            "means": u,
+            "variances": v,
+        },
+    )
+
+    estimated_variances = np.zeros((len(var_df),), dtype=np.float64)
+    not_const = v > 0
+    y = np.log10(v[not_const])
+    x = np.log10(u[not_const])
+    model = skmisc.loess.loess(x, y, span=0.3, degree=2)
+    model.fit()
+    estimated_variances[not_const] = model.outputs.fitted_values
+    reg_std = np.sqrt(10**estimated_variances)
+
+    # A second pass over the data is required because the clip value
+    # is determined by the first pass
+    N = query.n_obs
+    vmax = np.sqrt(N)
+    clip_val = reg_std * vmax + u
+    counts_sum = np.zeros((query.n_vars,), dtype=np.float64)  # clipped
+    squared_counts_sum = np.zeros((query.n_vars,), dtype=np.float64)  # clipped
+    for arrow_tbl in query.X("raw", prefetch=use_prefetch):
+        var_dim = indexer.var_index(arrow_tbl["soma_dim_1"])
+        data = arrow_tbl["soma_data"].to_numpy()
+        # clip
+        mask = data > clip_val[var_dim]
+        data = data.copy()
+        data[mask] = clip_val[var_dim[mask]]
+        np.add.at(counts_sum, var_dim, data)
+        np.add.at(squared_counts_sum, var_dim, data**2)
+
+    norm_gene_vars = (1 / ((N - 1) * np.square(reg_std))) * (
+        (N * np.square(u)) + squared_counts_sum - 2 * counts_sum * u
+    )
+    norm_gene_vars = norm_gene_vars.reshape(1, -1)
+
+    # argsort twice gives ranks, small rank means most variable
+    ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1), axis=1)
+
+    # this is done in SelectIntegrationFeatures() in Seurat v3
+    ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32)
+    num_batches_high_var = np.sum((ranked_norm_gene_vars < n_top_genes).astype(int), axis=0)
+    ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan
+    ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)  # type: ignore
+    median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)  # type: ignore
+
+    var_df = var_df.assign(
+        highly_variable_nbatches=pd.Series(num_batches_high_var, index=var_df.index),
+        highly_variable_rank=pd.Series(median_ranked, index=var_df.index),
+        variances_norm=pd.Series(np.mean(norm_gene_vars, axis=0), index=var_df.index),
+    )
+
+    sorted_index = (
+        var_df[["highly_variable_rank", "highly_variable_nbatches"]]
+        .sort_values(
+            ["highly_variable_rank", "highly_variable_nbatches"],
+            ascending=[True, False],
+            na_position="last",
+        )
+        .index
+    )
+    var_df["highly_variable"] = False
+    var_df = var_df.drop(columns=["highly_variable_nbatches"])
+    var_df.loc[sorted_index[: int(n_top_genes)], "highly_variable"] = True
+    return var_df
diff --git a/api/python/cell_census/src/cell_census/compute/meanvar.py b/api/python/cell_census/src/cell_census/compute/meanvar.py
@@ -0,0 +1,76 @@
+import numba
+import numpy as np
+import numpy.typing as npt
+
+
+class OnlineMatrixMeanVariance:
+    n_samples: int
+    n_variables: int
+
+    def __init__(self, n_samples: int, n_variables: int):
+        """
+        Compute mean and variance for n_variables over n_samples, encoded
+        in a COO format. Equivalent to:
+            numpy.mean(data, axis=0)
+            numpy.var(data, axix=0)
+        where the input `data` is of shape (n_samples, n_variables)
+        """
+        self.n_samples = n_samples
+        self.n_variables = n_variables
+
+        self.n_a = np.zeros((n_variables,), dtype=np.int32)
+        self.u_a = np.zeros((n_variables,), dtype=np.float64)
+        self.M2_a = np.zeros((n_variables,), dtype=np.float64)
+
+    def update(self, coord_vec: npt.NDArray[np.int64], value_vec: npt.NDArray[np.float32]) -> None:
+        _mean_variance_update(coord_vec, value_vec, self.n_a, self.u_a, self.M2_a)
+
+    def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
+        """
+        Returns tuple containing mean and variance
+        """
+        u, M2 = _mean_variance_finalize(self.n_samples, self.n_a, self.u_a, self.M2_a)
+
+        # compute sample variance
+        var = M2 / max(1, (self.n_samples - 1))
+
+        return u, var
+
+
+# TODO: add type signatures to annotation, removing need to do dynamic generation
+
+
+@numba.jit(nopython=True, nogil=True)  # type: ignore[misc]  # See https://github.com/numba/numba/issues/7424
+def _mean_variance_update(
+    col_arr: npt.NDArray[np.int64],
+    val_arr: npt.NDArray[np.float32],
+    n: npt.NDArray[np.int32],
+    u: npt.NDArray[np.float64],
+    M2: npt.NDArray[np.float64],
+) -> None:
+    """
+    Incrementally accumulate mean and sum of square of distance from mean using
+    Welford's online method.
+    """
+    for col, val in zip(col_arr, val_arr):
+        u_prev = u[col]
+        M2_prev = M2[col]
+        n[col] += 1
+        u[col] = u_prev + (val - u_prev) / n[col]
+        M2[col] = M2_prev + (val - u_prev) * (val - u[col])
+
+
+@numba.jit(nopython=True, nogil=True)  # type: ignore[misc]  # See https://github.com/numba/numba/issues/7424
+def _mean_variance_finalize(
+    n_samples: int, n_a: npt.NDArray[np.int32], u_a: npt.NDArray[np.float64], M2_a: npt.NDArray[np.float64]
+) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
+    """
+    Finalize incremental values, acconting for missing elements (due to sparse input).
+    Non-sparse and sparse combined using Chan's parallel adaptation of Welford's.
+    The code assumes the sparse elements are all zero and ignores those terms.
+    """
+    n_b = n_samples - n_a
+    delta = -u_a  # assumes u_b == 0
+    u = (n_a * u_a) / n_samples
+    M2 = M2_a + delta**2 * n_a * n_b / n_samples  # assumes M2_b == 0
+    return u, M2
diff --git a/api/python/cell_census/src/cell_census/experiment_query/__init__.py b/api/python/cell_census/src/cell_census/experiment_query/__init__.py
@@ -0,0 +1,12 @@
+from .axis import AxisQuery
+from .query import ExperimentQuery, experiment_query
+from .types import AxisColumnNames
+from .util import X_as_series
+
+__all__ = [
+    "experiment_query",
+    "AxisColumnNames",
+    "AxisQuery",
+    "ExperimentQuery",
+    "X_as_series",
+]
diff --git a/api/python/cell_census/src/cell_census/experiment_query/anndata.py b/api/python/cell_census/src/cell_census/experiment_query/anndata.py
@@ -0,0 +1,35 @@
+from typing import Dict, Tuple
+
+import anndata
+import pyarrow as pa
+import scipy.sparse as sparse
+
+from .types import ExperimentQueryReadArrowResult
+
+
+def arrow_to_scipy_csr(X: pa.Table, shape: Tuple[int, int]) -> sparse.csr_matrix:
+    return sparse.csr_matrix((X["soma_data"].to_numpy(), (X["_dim_0"].to_numpy(), X["_dim_1"].to_numpy())), shape=shape)
+
+
+def make_anndata(query_result: ExperimentQueryReadArrowResult) -> anndata.AnnData:
+
+    obs = query_result["obs"]
+    obs = obs.to_pandas()
+    obs.index = obs.index.map(str)
+
+    var = query_result["var"]
+    var = var.to_pandas()
+    var.index = var.index.map(str)
+
+    shape = (len(obs), len(var))
+
+    X = query_result.get("X", None)
+    if X is not None:
+        X = arrow_to_scipy_csr(X, shape)
+
+    X_layers = query_result.get("X_layers", {})
+    layers: Dict[str, sparse.csr_matrix] = {}
+    for X_layer_name, X_layer_table in X_layers.items():
+        layers[X_layer_name] = arrow_to_scipy_csr(X_layer_table, shape)
+
+    return anndata.AnnData(X=X, obs=obs, var=var, layers=(layers if len(layers) else None))
-Original file line number
+Diff line change
@@ Expand Up / @@ -127,3 +127,7 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    # common user tmp directories
+    tmp
+    temp