Merge pull request #147 from PixelgenTechnologies/feature/exe-1769-ds…

…b-implementation Feature/exe 1769 dsb implementation
PixelgenTechnologies · Jun 3, 2024 · 206c806 · 206c806
2 parents 7c69253 + 9562e38
commit 206c806
Show file tree

Hide file tree

Showing 7 changed files with 307 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [UNRELEASED] - 2024-??-??
 
+### Added
+
+* Add `dsb_normalization` function for normalization of marker abundance.
+
 ### Fixed
 
 * Fix a bug where `a_pixels_per_b_pixel` summary statistics where equal to the `b_pixels_per_a_pixel` statistics.

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -25,3 +25,7 @@
 - [local G](https://doi.org/10.1007/s11749-018-0599-x)
 
   > Bivand, R.S., Wong, D.W.S. Comparing implementations of global and local indicators of spatial association. TEST 27, 716–748 (2018). https://doi.org/10.1007/s11749-018-0599-x
+
+- [dsb](https://doi.org/10.1038/s41467-022-29356-8)
+
+  > Mulè, M.P., Martins, A.J. & Tsang, J.S. Normalizing and denoising protein expression data from droplet-based single cell profiling. Nat Commun 13, 2099 (2022). https://doi.org/10.1038/s41467-022-29356-8
diff --git a/src/pixelator/analysis/normalization/__init__.py b/src/pixelator/analysis/normalization/__init__.py
@@ -0,0 +1,66 @@
+"""Functions for the colocalization analysis in pixelator.
+
+Copyright © 2024 Pixelgen Technologies AB.
+"""
+
+from typing import List, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import StandardScaler
+
+
+def _regress_out_confounder(pheno, exprs, rcond=1e-8):
+    """Linear regression to remove confounding factors from abundance data."""
+    design_matrix = np.column_stack((np.ones((len(pheno), 1)), pheno))
+    coefficients, res, rank, s = np.linalg.lstsq(design_matrix, exprs, rcond=rcond)
+    beta = coefficients[1:]  # remove intercept term
+    return exprs - design_matrix[:, 1:].dot(beta)
+
+
+def _get_background_abundance(dataframe: pd.DataFrame, axis=0):
+    """Fit a double gaussian distribution to the abundance data and return the mean of the first gaussian as an estimation of the background level."""
+    background = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
+    scores = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
+    gmm = GaussianMixture(n_components=2, max_iter=1000, random_state=0)
+    if axis not in {0, 1}:
+        raise ValueError(f"Axis was {axis}. Must be 0 or 1")
+    ax_iter = dataframe.index if axis == 0 else dataframe.columns
+    for i in ax_iter:
+        current_axis = dataframe.loc[i, :] if axis == 0 else dataframe.loc[:, i]
+        gmm = gmm.fit(current_axis.to_frame())
+        background[i] = np.min(gmm.means_)
+        scores[i] = np.abs(gmm.means_[1] - gmm.means_[0]) / np.sum(gmm.covariances_)
+    return background, scores
+
+
+def dsb_normalize(
+    raw_abundance: pd.DataFrame, isotype_controls: Union[List, None] = None
+):
+    """empty-droplet-free method as implemented in Mulè et. al. dsb package.
+
+    The normalization steps are: 1- log1p transformation, 2- remove background
+    abundance per marker, 3- regularize abundance per component.
+
+    :param raw_abundance: the raw abundance count data.
+    :param isotype_controls: list of isotype controls.
+    :return: normalized abundance data.
+    """
+    log_abundance = np.log1p(raw_abundance)
+    marker_background, _ = _get_background_abundance(log_abundance, axis=1)
+    log_abundance = log_abundance - marker_background
+    component_background, _ = _get_background_abundance(log_abundance, axis=0)
+
+    if isotype_controls is not None:
+        control_signals = log_abundance.loc[:, isotype_controls]
+        control_signals["component_background"] = component_background
+        control_signals = StandardScaler().fit_transform(control_signals)
+        pheno = PCA(n_components=1).fit_transform(control_signals)
+    else:
+        raise ValueError(f"At least one isotype control must be provided.")
+
+    normalized_abundance = _regress_out_confounder(pheno, log_abundance)
+
+    return normalized_abundance
diff --git a/tests/analysis/normalization/__init__.py b/tests/analysis/normalization/__init__.py
@@ -0,0 +1,5 @@
+"""
+Tests for the normalization modules
+
+Copyright © 2024 Pixelgen Technologies AB.
+"""
diff --git a/tests/analysis/normalization/test_normalization.py b/tests/analysis/normalization/test_normalization.py
@@ -0,0 +1,26 @@
+"""
+Tests for the normalization modules
+
+Copyright © 2024 Pixelgen Technologies AB.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+from pandas.testing import assert_frame_equal
+from pixelator.analysis.normalization import dsb_normalize
+
+DATA_ROOT = Path(__file__).parents[2] / "data"
+
+
+def test_dsb_normalize():
+    input_data = pd.read_csv(
+        str(DATA_ROOT / "dsb_normalization_test_input.csv")
+    ).astype(float)
+    output_data = pd.read_csv(
+        str(DATA_ROOT / "dsb_normalization_test_output.csv")
+    ).astype(float)
+    output_data = output_data - output_data.iloc[0, :]
+    result = dsb_normalize(input_data, isotype_controls=["mIgG1", "mIgG2a", "mIgG2b"])
+    result = result - result.iloc[0, :]
+    assert_frame_equal(result, output_data, atol=0.05)