-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
initial commit of migrated source (#2)
* initial commit of migrated source * add notebooks * fix readme * pr review changes * add missing semver
- Loading branch information
Bruce Martin
authored
Nov 30, 2022
1 parent
c21177c
commit aa2277e
Showing
51 changed files
with
7,282 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[flake8] | ||
max-line-length = 120 | ||
extend-ignore = | ||
# See https://github.com/PyCQA/pycodestyle/issues/373 | ||
E203, | ||
E402 | ||
exclude = | ||
.git/ | ||
venv/ | ||
tmp/ | ||
.ipynb_checkpoints/ | ||
__pycache__ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
name: Python Linting | ||
|
||
on: | ||
pull_request: | ||
push: | ||
branches: [main] | ||
|
||
jobs: | ||
lint: | ||
runs-on: ubuntu-latest | ||
defaults: | ||
run: | ||
working-directory: . | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- uses: actions/setup-python@v4 | ||
with: | ||
python-version: "3.9" | ||
- run: pip install -U pip | ||
- name: Python Black | ||
run: | | ||
python -m pip install black black[jupyter] | ||
python -m black --check --diff . | ||
- name: Python isort | ||
run: | | ||
python -m pip install isort | ||
python -m isort --check --diff . | ||
- name: Python style with flake8[bugbear] | ||
run: | | ||
python -m pip install flake8-bugbear | ||
python -m flake8 . | ||
- name: Check type annotations mypy | ||
run: | | ||
python -m pip install mypy==0.982 types-setuptools types-requests numpy | ||
python -m mypy . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,3 +127,7 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
# common user tmp directories | ||
tmp | ||
temp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2022 Chan Zuckerberg Initiative | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
The `cell_census` package provides an API to facilitate use of the CZI Science Cell Census. | ||
|
||
**Status**: Unstable, under rapid development | ||
|
||
For more information, see the [cell_census repo](https://github.com/chanzuckerberg/cell-census/)### For More Help | ||
|
||
For more help, please file a issue on the repo, or contact us at <[email protected]> | ||
|
||
If you believe you have found a security issue, we would appreciate notification. Please send email to <[email protected]>. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
[metadata] | ||
name = cell_census | ||
version = attr: cell_census.__version__ | ||
author = Chan Zuckerberg Initiative | ||
author_email = [email protected] | ||
description = API to simplify use of the CZI Science CELLxGENE Cell Census | ||
long_description = file: README.md LICENSE | ||
license = MIT | ||
url = https://github.com/chanzuckerberg/cell-census | ||
|
||
[options] | ||
python_requires = >= 3.8 | ||
install_requires = | ||
numba | ||
numpy | ||
requests | ||
tiledb | ||
tiledbsoma | ||
typing_extensions | ||
s3fs | ||
scikit-misc | ||
package_dir= | ||
=src | ||
packages=find: | ||
|
||
[options.packages.find] | ||
where=src |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from setuptools import setup | ||
|
||
if __name__ == "__main__": | ||
setup() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from .get_anndata import get_anndata | ||
from .open import download_source_h5ad, get_source_h5ad_uri, open_soma | ||
from .release_directory import get_directory, get_release_description | ||
|
||
__version__ = "0.0.1-dev0" | ||
|
||
__all__ = [ | ||
"download_source_h5ad", | ||
"get_anndata", | ||
"get_directory", | ||
"get_source_h5ad_uri", | ||
"get_release_description", | ||
"open_soma", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .highly_variable_genes import highly_variable_genes | ||
from .meanvar import OnlineMatrixMeanVariance | ||
|
||
__all__ = [ | ||
"highly_variable_genes", | ||
"OnlineMatrixMeanVariance", | ||
] |
94 changes: 94 additions & 0 deletions
94
api/python/cell_census/src/cell_census/compute/highly_variable_genes.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from ..experiment_query import ExperimentQuery | ||
from .meanvar import OnlineMatrixMeanVariance | ||
|
||
|
||
def highly_variable_genes(query: ExperimentQuery, n_top_genes: int = 10) -> pd.DataFrame: | ||
""" | ||
Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy | ||
""" | ||
use_prefetch = True | ||
|
||
try: | ||
import skmisc.loess | ||
except ImportError: | ||
raise ImportError("Please install skmisc package via `pip install --user scikit-misc") | ||
|
||
indexer = query.get_indexer() | ||
mvn = OnlineMatrixMeanVariance(query.n_obs, query.n_vars) | ||
for arrow_tbl in query.X("raw", prefetch=use_prefetch): | ||
var_dim = indexer.var_index(arrow_tbl["soma_dim_1"]) | ||
data = arrow_tbl["soma_data"].to_numpy() | ||
mvn.update(var_dim, data) | ||
|
||
u, v = mvn.finalize() | ||
var_df = pd.DataFrame( | ||
index=pd.Index(data=query.var_joinids(), name="soma_joinid"), | ||
data={ | ||
"means": u, | ||
"variances": v, | ||
}, | ||
) | ||
|
||
estimated_variances = np.zeros((len(var_df),), dtype=np.float64) | ||
not_const = v > 0 | ||
y = np.log10(v[not_const]) | ||
x = np.log10(u[not_const]) | ||
model = skmisc.loess.loess(x, y, span=0.3, degree=2) | ||
model.fit() | ||
estimated_variances[not_const] = model.outputs.fitted_values | ||
reg_std = np.sqrt(10**estimated_variances) | ||
|
||
# A second pass over the data is required because the clip value | ||
# is determined by the first pass | ||
N = query.n_obs | ||
vmax = np.sqrt(N) | ||
clip_val = reg_std * vmax + u | ||
counts_sum = np.zeros((query.n_vars,), dtype=np.float64) # clipped | ||
squared_counts_sum = np.zeros((query.n_vars,), dtype=np.float64) # clipped | ||
for arrow_tbl in query.X("raw", prefetch=use_prefetch): | ||
var_dim = indexer.var_index(arrow_tbl["soma_dim_1"]) | ||
data = arrow_tbl["soma_data"].to_numpy() | ||
# clip | ||
mask = data > clip_val[var_dim] | ||
data = data.copy() | ||
data[mask] = clip_val[var_dim[mask]] | ||
np.add.at(counts_sum, var_dim, data) | ||
np.add.at(squared_counts_sum, var_dim, data**2) | ||
|
||
norm_gene_vars = (1 / ((N - 1) * np.square(reg_std))) * ( | ||
(N * np.square(u)) + squared_counts_sum - 2 * counts_sum * u | ||
) | ||
norm_gene_vars = norm_gene_vars.reshape(1, -1) | ||
|
||
# argsort twice gives ranks, small rank means most variable | ||
ranked_norm_gene_vars = np.argsort(np.argsort(-norm_gene_vars, axis=1), axis=1) | ||
|
||
# this is done in SelectIntegrationFeatures() in Seurat v3 | ||
ranked_norm_gene_vars = ranked_norm_gene_vars.astype(np.float32) | ||
num_batches_high_var = np.sum((ranked_norm_gene_vars < n_top_genes).astype(int), axis=0) | ||
ranked_norm_gene_vars[ranked_norm_gene_vars >= n_top_genes] = np.nan | ||
ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars) # type: ignore | ||
median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan) # type: ignore | ||
|
||
var_df = var_df.assign( | ||
highly_variable_nbatches=pd.Series(num_batches_high_var, index=var_df.index), | ||
highly_variable_rank=pd.Series(median_ranked, index=var_df.index), | ||
variances_norm=pd.Series(np.mean(norm_gene_vars, axis=0), index=var_df.index), | ||
) | ||
|
||
sorted_index = ( | ||
var_df[["highly_variable_rank", "highly_variable_nbatches"]] | ||
.sort_values( | ||
["highly_variable_rank", "highly_variable_nbatches"], | ||
ascending=[True, False], | ||
na_position="last", | ||
) | ||
.index | ||
) | ||
var_df["highly_variable"] = False | ||
var_df = var_df.drop(columns=["highly_variable_nbatches"]) | ||
var_df.loc[sorted_index[: int(n_top_genes)], "highly_variable"] = True | ||
return var_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import numba | ||
import numpy as np | ||
import numpy.typing as npt | ||
|
||
|
||
class OnlineMatrixMeanVariance: | ||
n_samples: int | ||
n_variables: int | ||
|
||
def __init__(self, n_samples: int, n_variables: int): | ||
""" | ||
Compute mean and variance for n_variables over n_samples, encoded | ||
in a COO format. Equivalent to: | ||
numpy.mean(data, axis=0) | ||
numpy.var(data, axix=0) | ||
where the input `data` is of shape (n_samples, n_variables) | ||
""" | ||
self.n_samples = n_samples | ||
self.n_variables = n_variables | ||
|
||
self.n_a = np.zeros((n_variables,), dtype=np.int32) | ||
self.u_a = np.zeros((n_variables,), dtype=np.float64) | ||
self.M2_a = np.zeros((n_variables,), dtype=np.float64) | ||
|
||
def update(self, coord_vec: npt.NDArray[np.int64], value_vec: npt.NDArray[np.float32]) -> None: | ||
_mean_variance_update(coord_vec, value_vec, self.n_a, self.u_a, self.M2_a) | ||
|
||
def finalize(self) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: | ||
""" | ||
Returns tuple containing mean and variance | ||
""" | ||
u, M2 = _mean_variance_finalize(self.n_samples, self.n_a, self.u_a, self.M2_a) | ||
|
||
# compute sample variance | ||
var = M2 / max(1, (self.n_samples - 1)) | ||
|
||
return u, var | ||
|
||
|
||
# TODO: add type signatures to annotation, removing need to do dynamic generation | ||
|
||
|
||
@numba.jit(nopython=True, nogil=True) # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 | ||
def _mean_variance_update( | ||
col_arr: npt.NDArray[np.int64], | ||
val_arr: npt.NDArray[np.float32], | ||
n: npt.NDArray[np.int32], | ||
u: npt.NDArray[np.float64], | ||
M2: npt.NDArray[np.float64], | ||
) -> None: | ||
""" | ||
Incrementally accumulate mean and sum of square of distance from mean using | ||
Welford's online method. | ||
""" | ||
for col, val in zip(col_arr, val_arr): | ||
u_prev = u[col] | ||
M2_prev = M2[col] | ||
n[col] += 1 | ||
u[col] = u_prev + (val - u_prev) / n[col] | ||
M2[col] = M2_prev + (val - u_prev) * (val - u[col]) | ||
|
||
|
||
@numba.jit(nopython=True, nogil=True) # type: ignore[misc] # See https://github.com/numba/numba/issues/7424 | ||
def _mean_variance_finalize( | ||
n_samples: int, n_a: npt.NDArray[np.int32], u_a: npt.NDArray[np.float64], M2_a: npt.NDArray[np.float64] | ||
) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: | ||
""" | ||
Finalize incremental values, acconting for missing elements (due to sparse input). | ||
Non-sparse and sparse combined using Chan's parallel adaptation of Welford's. | ||
The code assumes the sparse elements are all zero and ignores those terms. | ||
""" | ||
n_b = n_samples - n_a | ||
delta = -u_a # assumes u_b == 0 | ||
u = (n_a * u_a) / n_samples | ||
M2 = M2_a + delta**2 * n_a * n_b / n_samples # assumes M2_b == 0 | ||
return u, M2 |
12 changes: 12 additions & 0 deletions
12
api/python/cell_census/src/cell_census/experiment_query/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from .axis import AxisQuery | ||
from .query import ExperimentQuery, experiment_query | ||
from .types import AxisColumnNames | ||
from .util import X_as_series | ||
|
||
__all__ = [ | ||
"experiment_query", | ||
"AxisColumnNames", | ||
"AxisQuery", | ||
"ExperimentQuery", | ||
"X_as_series", | ||
] |
35 changes: 35 additions & 0 deletions
35
api/python/cell_census/src/cell_census/experiment_query/anndata.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from typing import Dict, Tuple | ||
|
||
import anndata | ||
import pyarrow as pa | ||
import scipy.sparse as sparse | ||
|
||
from .types import ExperimentQueryReadArrowResult | ||
|
||
|
||
def arrow_to_scipy_csr(X: pa.Table, shape: Tuple[int, int]) -> sparse.csr_matrix: | ||
return sparse.csr_matrix((X["soma_data"].to_numpy(), (X["_dim_0"].to_numpy(), X["_dim_1"].to_numpy())), shape=shape) | ||
|
||
|
||
def make_anndata(query_result: ExperimentQueryReadArrowResult) -> anndata.AnnData: | ||
|
||
obs = query_result["obs"] | ||
obs = obs.to_pandas() | ||
obs.index = obs.index.map(str) | ||
|
||
var = query_result["var"] | ||
var = var.to_pandas() | ||
var.index = var.index.map(str) | ||
|
||
shape = (len(obs), len(var)) | ||
|
||
X = query_result.get("X", None) | ||
if X is not None: | ||
X = arrow_to_scipy_csr(X, shape) | ||
|
||
X_layers = query_result.get("X_layers", {}) | ||
layers: Dict[str, sparse.csr_matrix] = {} | ||
for X_layer_name, X_layer_table in X_layers.items(): | ||
layers[X_layer_name] = arrow_to_scipy_csr(X_layer_table, shape) | ||
|
||
return anndata.AnnData(X=X, obs=obs, var=var, layers=(layers if len(layers) else None)) |
Oops, something went wrong.