diff --git a/docs/release-notes/0.12.0.md b/docs/release-notes/0.12.0.md new file mode 100644 index 00000000..ecfe99bc --- /dev/null +++ b/docs/release-notes/0.12.0.md @@ -0,0 +1,14 @@ +### 0.12.0 {small}`the-future` + +```{rubric} Features +``` +* Updates `pp.filter_genes` & `pp.filter_cells` to be closer to the `scanpy` implementation {pr}`324` {smaller}`S Dicks` + +```{rubric} Performance +``` + +```{rubric} Bug fixes +``` + +```{rubric} Misc +``` diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md index b3f4c1e5..a2f3ad9f 100644 --- a/docs/release-notes/index.md +++ b/docs/release-notes/index.md @@ -2,6 +2,10 @@ # Release notes +## Version 0.12.0 +```{include} /release-notes/0.12.0.md +``` + ## Version 0.11.0 ```{include} /release-notes/0.11.1.md ``` diff --git a/notebooks b/notebooks index 2229393d..3d1c1e5e 160000 --- a/notebooks +++ b/notebooks @@ -1 +1 @@ -Subproject commit 2229393de30dbef6cd36e387e4b4984b946bcc90 +Subproject commit 3d1c1e5e54cb6ce1b0370bdfe68227220b6e4a3b diff --git a/src/rapids_singlecell/preprocessing/_scrublet/__init__.py b/src/rapids_singlecell/preprocessing/_scrublet/__init__.py index 0eac21fd..39947997 100644 --- a/src/rapids_singlecell/preprocessing/_scrublet/__init__.py +++ b/src/rapids_singlecell/preprocessing/_scrublet/__init__.py @@ -187,10 +187,8 @@ def _run_scrublet(ad_obs: AnnData, ad_sim: AnnData | None = None): # counts and simulating doublets if ad_sim is None: - pp.filter_genes(ad_obs, min_count=3, verbose=False) - pp.filter_cells( - ad_obs, min_count=3, qc_var="n_genes_by_counts", verbose=False - ) + pp.filter_genes(ad_obs, min_cells=3, verbose=False) + pp.filter_cells(ad_obs, min_genes=3, verbose=False) # Doublet simulation will be based on the un-normalised counts, but on the # selection of genes following normalisation and variability filtering. So diff --git a/src/rapids_singlecell/preprocessing/_simple.py b/src/rapids_singlecell/preprocessing/_simple.py index ab06e08b..94db49a3 100644 --- a/src/rapids_singlecell/preprocessing/_simple.py +++ b/src/rapids_singlecell/preprocessing/_simple.py @@ -3,12 +3,15 @@ from typing import TYPE_CHECKING import cupy as cp -import numpy as np +from anndata import AnnData -from ._qc import calculate_qc_metrics +from ._qc import _basic_qc +from ._utils import _check_gpu_X if TYPE_CHECKING: - from anndata import AnnData + import numpy as np + + from rapids_singlecell._utils import ArrayTypesDask def flag_gene_family( @@ -52,149 +55,251 @@ def flag_gene_family( def filter_genes( - adata: AnnData, + data: AnnData | ArrayTypesDask, *, - qc_var: str = "n_cells_by_counts", - min_count: int = None, - max_count: int = None, + min_counts: int | None = None, + min_cells: int | None = None, + max_counts: int | None = None, + max_cells: int | None = None, + inplace: bool = True, verbose: bool = True, -) -> None: - """ +) -> tuple[np.ndarray, np.ndarray] | None: + """\ Filter genes based on number of cells or counts. - Filters genes, that have greater than a max number of genes or less than - a minimum number of a feature in a given :attr:`~anndata.AnnData.var` columns. Can so far only be used for numerical columns. - You can run this function on 'n_cells' or 'n_counts' with a previous columns in :attr:`~anndata.AnnData.var`. + Keep genes that have at least `min_counts` counts or are expressed in at + least `min_cells` cells or have at most `max_counts` counts or are expressed + in at most `max_cells` cells. + + Only provide one of the optional parameters `min_counts`, `min_cells`, + `max_counts`, `max_cells` per call. Parameters ---------- - adata: - AnnData object - - qc_var - column in :attr:`~anndata.AnnData.var` with numerical entries to filter against - - min_count - Lower bound on number of a given feature to keep gene - - max_count - Upper bound on number of a given feature to keep gene - - verbose - Print number of discarded genes + data + An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond + to cells and columns to genes. + min_counts + Minimum number of counts required for a gene to pass filtering. + min_cells + Minimum number of cells expressed required for a gene to pass filtering. + max_counts + Maximum number of counts required for a gene to pass filtering. + max_cells + Maximum number of cells expressed required for a gene to pass filtering. + inplace + Perform computation inplace or return result. + verbose + Print number of discarded genes Returns ------- - a filtered :class:`~anndata.AnnData` object inplace - + Depending on `inplace`, returns the following arrays or directly subsets + and annotates the data matrix + + gene_subset + Boolean index mask that does filtering. `True` means that the + gene is kept. `False` means the gene is removed. + number_per_gene + Depending on what was thresholded (`counts` or `cells`), the array stores + `n_counts` or `n_cells` per gene. """ - if qc_var in adata.var.keys(): - if min_count is not None and max_count is not None: - thr = (adata.var[qc_var] <= max_count) & (min_count <= adata.var[qc_var]) - elif min_count is not None: - thr = adata.var[qc_var] >= min_count - elif max_count is not None: - thr = adata.var[qc_var] <= max_count - - if verbose: - print( - f"filtered out {adata.var.shape[0] - thr.sum()} genes based on {qc_var}" - ) - - adata._inplace_subset_var(thr) - - elif qc_var in [ - "n_cells_by_counts", - "total_counts", - "mean_counts", - "pct_dropout_by_counts", - ]: - if verbose: - print( - "Running `calculate_qc_metrics` for 'n_cells_by_counts','total_counts','mean_counts' or 'pct_dropout_by_counts'" - ) - calculate_qc_metrics(adata=adata, log1p=False) - if min_count is not None and max_count is not None: - thr = (adata.var[qc_var] <= max_count) & (min_count <= adata.var[qc_var]) - elif min_count is not None: - thr = adata.var[qc_var] >= min_count - elif max_count is not None: - thr = adata.var[qc_var] <= max_count - if verbose: - print( - f"filtered out {adata.var.shape[0] - thr.sum()} genes based on {qc_var}" - ) - - adata._inplace_subset_var(thr) + n_given_options = sum( + option is not None for option in [min_cells, min_counts, max_cells, max_counts] + ) + if n_given_options != 1: + msg = ( + "Only provide one of the optional parameters `min_counts`, " + "`min_cells`, `max_counts`, `max_cells` per call." + ) + raise ValueError(msg) + + if isinstance(data, AnnData): + X = data.X + else: + X = data # proceed with processing the data matrix + + _check_gpu_X(X, allow_dask=True) + _, sums_genes, _, n_cells_per_gene = _basic_qc(X=X) + min_number = min_counts if min_cells is None else min_cells + max_number = max_counts if max_cells is None else max_cells + number_per_gene = ( + n_cells_per_gene + if (min_cells is not None or max_cells is not None) + else sums_genes + ) + if min_number is not None: + gene_subset = number_per_gene >= min_number + if max_number is not None: + gene_subset = number_per_gene <= max_number + + if verbose: + s = cp.sum(~gene_subset) + if s > 0: + msg = f"filtered out {s} genes that are detected " + if min_cells is not None or min_counts is not None: + msg += "in less than " + msg += ( + f"{min_cells} cells" + if min_counts is None + else f"{min_counts} counts" + ) + if max_cells is not None or max_counts is not None: + msg += "in more than " + msg += ( + f"{max_cells} cells" + if max_counts is None + else f"{max_counts} counts" + ) + print(msg) + + if isinstance(data, AnnData) and inplace: + data.var["n_counts"] = sums_genes.get() + + data.var["n_cells"] = n_cells_per_gene.get() + data._inplace_subset_var(gene_subset.get()) else: - print("please check qc_var") + return gene_subset.get(), number_per_gene.get() def filter_cells( - adata: AnnData, + data: AnnData | ArrayTypesDask, *, - qc_var: str, - min_count: float = None, - max_count: float = None, + min_counts: int | None = None, + min_genes: int | None = None, + max_counts: int | None = None, + max_genes: int | None = None, + inplace: bool = True, verbose: bool = True, -) -> None: +) -> tuple[np.ndarray, np.ndarray] | None: """\ Filter cell outliers based on counts and numbers of genes expressed. - Filter cells based on numerical columns in the :attr:`~anndata.AnnData.obs` by selecting those with a feature count greater than a specified maximum or less than a specified minimum. - It is recommended to run :func:`calculate_qc_metrics` before using this function. You can run this function on n_genes or n_counts before running :func:`calculate_qc_metrics`. + For instance, only keep cells with at least `min_counts` counts or + `min_genes` genes expressed. This is to filter measurement outliers, + i.e. “unreliable” observations. + + Only provide one of the optional parameters `min_counts`, `min_genes`, + `max_counts`, `max_genes` per call. Parameters ---------- - adata: - AnnData object - qc_var - column in .obs with numerical entries to filter against - min_count - Lower bound on number of a given feature to keep cell - max_count - Upper bound on number of a given feature to keep cell - verbose - Print number of discarded cells + data + The (annotated) data matrix of shape `n_obs` × `n_vars`. + Rows correspond to cells and columns to genes. + min_counts + Minimum number of counts required for a cell to pass filtering. + min_genes + Minimum number of genes expressed required for a cell to pass filtering. + max_counts + Maximum number of counts required for a cell to pass filtering. + max_genes + Maximum number of genes expressed required for a cell to pass filtering. + inplace + Perform computation inplace or return result. + verbose + Print number of discarded cells Returns ------- - a filtered :class:`~anndata.AnnData` object inplace - + Depending on `inplace`, returns the following arrays or directly subsets + and annotates the data matrix: + + cells_subset + Boolean index mask that does filtering. `True` means that the + cell is kept. `False` means the cell is removed. + number_per_cell + Depending on what was thresholded (`counts` or `genes`), + the array stores `n_counts` or `n_cells` per gene. + + Examples + -------- + >>> import scanpy as sc + >>> adata = sc.datasets.krumsiek11() + UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`. + utils.warn_names_duplicates("obs") + >>> adata.obs_names_make_unique() + >>> adata.n_obs + 640 + >>> adata.var_names.tolist() # doctest: +NORMALIZE_WHITESPACE + ['Gata2', 'Gata1', 'Fog1', 'EKLF', 'Fli1', 'SCL', + 'Cebpa', 'Pu.1', 'cJun', 'EgrNab', 'Gfi1'] + >>> # add some true zeros + >>> adata.X[adata.X < 0.3] = 0 + >>> # simply compute the number of genes per cell + >>> sc.pp.filter_cells(adata, min_genes=0) + >>> adata.n_obs + 640 + >>> int(adata.obs['n_genes'].min()) + 1 + >>> # filter manually + >>> adata_copy = adata[adata.obs['n_genes'] >= 3] + >>> adata_copy.n_obs + 554 + >>> int(adata_copy.obs['n_genes'].min()) + 3 + >>> # actually do some filtering + >>> sc.pp.filter_cells(adata, min_genes=3) + >>> adata.n_obs + 554 + >>> int(adata.obs['n_genes'].min()) + 3 """ - if qc_var in adata.obs.keys(): - inter = np.array - if min_count is not None and max_count is not None: - inter = (adata.obs[qc_var] <= max_count) & (min_count <= adata.obs[qc_var]) - elif min_count is not None: - inter = adata.obs[qc_var] >= min_count - elif max_count is not None: - inter = adata.obs[qc_var] <= max_count - else: - print("Please specify a cutoff to filter against") - if verbose: - print(f"filtered out {adata.obs.shape[0] - inter.sum()} cells") - adata._inplace_subset_obs(inter) - elif qc_var in ["n_genes_by_counts", "total_counts"]: - if verbose: - print( - "Running `calculate_qc_metrics` for 'n_cells_by_counts' or 'total_counts'" - ) - calculate_qc_metrics(adata, log1p=False) - inter = np.array - if min_count is not None and max_count is not None: - inter = (adata.obs[qc_var] <= max_count) & (min_count <= adata.obs[qc_var]) - elif min_count is not None: - inter = adata.obs[qc_var] >= min_count - elif max_count is not None: - inter = adata.obs[qc_var] <= max_count - else: - print("Please specify a cutoff to filter against") - if verbose: - print(f"filtered out {adata.obs.shape[0] - inter.sum()} cells") - adata._inplace_subset_obs(inter) + n_given_options = sum( + option is not None for option in [min_genes, min_counts, max_genes, max_counts] + ) + if n_given_options != 1: + msg = ( + "Only provide one of the optional parameters `min_counts`, " + "`min_genes`, `max_counts`, `max_genes` per call." + ) + raise ValueError(msg) + if isinstance(data, AnnData): + X = data.X + else: + X = data + + _check_gpu_X(X, allow_dask=True) + sums_cells, _, n_genes_per_cell, _ = _basic_qc(X=X) + min_number = min_counts if min_genes is None else min_genes + max_number = max_counts if max_genes is None else max_genes + number_per_cell = ( + n_genes_per_cell + if (min_genes is not None or max_genes is not None) + else sums_cells + ) + + if min_number is not None: + cell_subset = number_per_cell >= min_number + if max_number is not None: + cell_subset = number_per_cell <= max_number + + if verbose: + s = cp.sum(~cell_subset) + if s > 0: + msg = f"filtered out {s} cells that have " + if min_genes is not None or min_counts is not None: + msg += "less than " + msg += ( + f"{min_genes} genes expressed" + if min_counts is None + else f"{min_counts} counts" + ) + if max_genes is not None or max_counts is not None: + msg += "more than " + msg += ( + f"{max_genes} genes expressed" + if max_counts is None + else f"{max_counts} counts" + ) + print(msg) + + if isinstance(data, AnnData) and inplace: + data.obs["n_counts"] = sums_cells.get() + data.obs["n_genes"] = n_genes_per_cell.get() + data._inplace_subset_obs(cell_subset.get()) else: - print("Please check qc_var.") + return cell_subset.get(), number_per_cell.get() def filter_highly_variable(adata: AnnData) -> None: diff --git a/tests/dask/test_dask_pca.py b/tests/dask/test_dask_pca.py index 08d6d5d9..e228a550 100644 --- a/tests/dask/test_dask_pca.py +++ b/tests/dask/test_dask_pca.py @@ -67,8 +67,8 @@ def test_pca_dask_full_pipeline(client, data_kind): else: raise ValueError(f"Unknown data_kind {data_kind}") - rsc.pp.filter_genes(adata_1, min_count=500) - rsc.pp.filter_genes(adata_2, min_count=500) + rsc.pp.filter_genes(adata_1, min_cells=500) + rsc.pp.filter_genes(adata_2, min_cells=500) rsc.pp.normalize_total(adata_1, target_sum=1e4) rsc.pp.normalize_total(adata_2, target_sum=1e4) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py new file mode 100644 index 00000000..f9c76240 --- /dev/null +++ b/tests/test_preprocessing.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import cupy as cp +import pytest +from cupy.testing import assert_allclose +from cupyx.scipy.sparse import csc_matrix, csr_matrix, issparse +from scanpy.datasets import pbmc68k_reduced + +import rapids_singlecell as rsc + + +@pytest.mark.parametrize("array_type", (csr_matrix, csc_matrix, cp.ndarray)) +@pytest.mark.parametrize( + ("max_cells", "max_counts", "min_cells", "min_counts"), + [ + (100, None, None, None), + (None, 100, None, None), + (None, None, 20, None), + (None, None, None, 20), + ], +) +def test_filter_genes(array_type, max_cells, max_counts, min_cells, min_counts): + adata = pbmc68k_reduced() + adata.X = adata.raw.X.todense() + rsc.get.anndata_to_GPU(adata) + adata_casted = adata.copy() + if array_type is not cp.ndarray: + adata_casted.X = array_type(adata_casted.X) + rsc.pp.filter_genes( + adata, + max_cells=max_cells, + max_counts=max_counts, + min_cells=min_cells, + min_counts=min_counts, + ) + rsc.pp.filter_genes( + adata_casted, + max_cells=max_cells, + max_counts=max_counts, + min_cells=min_cells, + min_counts=min_counts, + ) + X = adata_casted.X + if issparse(X): + X = X.todense() + + assert_allclose(X, adata.X, rtol=1e-5, atol=1e-5) + + +@pytest.mark.parametrize("array_type", (csr_matrix, csc_matrix, cp.ndarray)) +@pytest.mark.parametrize( + ("max_genes", "max_counts", "min_genes", "min_counts"), + [ + (100, None, None, None), + (None, 100, None, None), + (None, None, 20, None), + (None, None, None, 20), + ], +) +def test_filter_cells(array_type, max_genes, max_counts, min_genes, min_counts): + adata = pbmc68k_reduced() + adata.X = adata.raw.X.todense() + rsc.get.anndata_to_GPU(adata) + adata_casted = adata.copy() + if array_type is not cp.ndarray: + adata_casted.X = array_type(adata_casted.X) + rsc.pp.filter_cells( + adata, + max_genes=max_genes, + max_counts=max_counts, + min_genes=min_genes, + min_counts=min_counts, + ) + rsc.pp.filter_cells( + adata_casted, + max_genes=max_genes, + max_counts=max_counts, + min_genes=min_genes, + min_counts=min_counts, + ) + X = adata_casted.X + if issparse(X): + X = X.todense() + assert_allclose(X, adata.X, rtol=1e-5, atol=1e-5) diff --git a/tests/test_scrublet.py b/tests/test_scrublet.py index abfe336c..d1b9005b 100644 --- a/tests/test_scrublet.py +++ b/tests/test_scrublet.py @@ -86,8 +86,8 @@ def test_scrublet_batched(): def _preprocess_for_scrublet(adata: AnnData) -> AnnData: adata_pp = adata.copy() - rsc.pp.filter_genes(adata_pp, min_count=3) - rsc.pp.filter_cells(adata_pp, qc_var="n_genes_by_counts", min_count=3) + rsc.pp.filter_genes(adata_pp, min_cells=3) + rsc.pp.filter_cells(adata_pp, min_genes=3) adata_pp.layers["raw"] = adata_pp.X.copy() rsc.pp.normalize_total(adata_pp) logged = rsc.pp.log1p(adata_pp, copy=True) @@ -169,7 +169,7 @@ def _scrub_small_sess() -> AnnData: # Reduce size of input for faster test adata = pbmc200() rsc.get.anndata_to_GPU(adata) - rsc.pp.filter_genes(adata, min_count=100, verbose=False) + rsc.pp.filter_genes(adata, min_cells=100) rsc.pp.scrublet(adata, use_approx_neighbors=False) return adata @@ -210,10 +210,8 @@ def test_scrublet_simulate_doublets(): """Check that doublet simulation runs and simulates some doublets.""" adata_obs = pbmc200() rsc.get.anndata_to_GPU(adata_obs) - rsc.pp.filter_genes(adata_obs, min_count=3, verbose=False) - rsc.pp.filter_cells( - adata_obs, min_count=3, qc_var="n_genes_by_counts", verbose=False - ) + rsc.pp.filter_genes(adata_obs, min_cells=3) + rsc.pp.filter_cells(adata_obs, min_genes=3) adata_obs.layers["raw"] = adata_obs.X rsc.pp.normalize_total(adata_obs) logged = rsc.pp.log1p(adata_obs, copy=True)