Skip to content

Commit

Permalink
Merge pull request #147 from PixelgenTechnologies/feature/exe-1769-ds…
Browse files Browse the repository at this point in the history
…b-implementation

Feature/exe 1769 dsb implementation
  • Loading branch information
ptajvar authored Jun 3, 2024
2 parents 7c69253 + 9562e38 commit 206c806
Show file tree
Hide file tree
Showing 7 changed files with 307 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [UNRELEASED] - 2024-??-??

### Added

* Add `dsb_normalization` function for normalization of marker abundance.

### Fixed

* Fix a bug where `a_pixels_per_b_pixel` summary statistics where equal to the `b_pixels_per_a_pixel` statistics.
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@
- [local G](https://doi.org/10.1007/s11749-018-0599-x)

> Bivand, R.S., Wong, D.W.S. Comparing implementations of global and local indicators of spatial association. TEST 27, 716–748 (2018). https://doi.org/10.1007/s11749-018-0599-x
- [dsb](https://doi.org/10.1038/s41467-022-29356-8)

> Mulè, M.P., Martins, A.J. & Tsang, J.S. Normalizing and denoising protein expression data from droplet-based single cell profiling. Nat Commun 13, 2099 (2022). https://doi.org/10.1038/s41467-022-29356-8
66 changes: 66 additions & 0 deletions src/pixelator/analysis/normalization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""Functions for the colocalization analysis in pixelator.
Copyright © 2024 Pixelgen Technologies AB.
"""

from typing import List, Union

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler


def _regress_out_confounder(pheno, exprs, rcond=1e-8):
"""Linear regression to remove confounding factors from abundance data."""
design_matrix = np.column_stack((np.ones((len(pheno), 1)), pheno))
coefficients, res, rank, s = np.linalg.lstsq(design_matrix, exprs, rcond=rcond)
beta = coefficients[1:] # remove intercept term
return exprs - design_matrix[:, 1:].dot(beta)


def _get_background_abundance(dataframe: pd.DataFrame, axis=0):
"""Fit a double gaussian distribution to the abundance data and return the mean of the first gaussian as an estimation of the background level."""
background = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
scores = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
gmm = GaussianMixture(n_components=2, max_iter=1000, random_state=0)
if axis not in {0, 1}:
raise ValueError(f"Axis was {axis}. Must be 0 or 1")
ax_iter = dataframe.index if axis == 0 else dataframe.columns
for i in ax_iter:
current_axis = dataframe.loc[i, :] if axis == 0 else dataframe.loc[:, i]
gmm = gmm.fit(current_axis.to_frame())
background[i] = np.min(gmm.means_)
scores[i] = np.abs(gmm.means_[1] - gmm.means_[0]) / np.sum(gmm.covariances_)
return background, scores


def dsb_normalize(
raw_abundance: pd.DataFrame, isotype_controls: Union[List, None] = None
):
"""empty-droplet-free method as implemented in Mulè et. al. dsb package.
The normalization steps are: 1- log1p transformation, 2- remove background
abundance per marker, 3- regularize abundance per component.
:param raw_abundance: the raw abundance count data.
:param isotype_controls: list of isotype controls.
:return: normalized abundance data.
"""
log_abundance = np.log1p(raw_abundance)
marker_background, _ = _get_background_abundance(log_abundance, axis=1)
log_abundance = log_abundance - marker_background
component_background, _ = _get_background_abundance(log_abundance, axis=0)

if isotype_controls is not None:
control_signals = log_abundance.loc[:, isotype_controls]
control_signals["component_background"] = component_background
control_signals = StandardScaler().fit_transform(control_signals)
pheno = PCA(n_components=1).fit_transform(control_signals)
else:
raise ValueError(f"At least one isotype control must be provided.")

normalized_abundance = _regress_out_confounder(pheno, log_abundance)

return normalized_abundance
5 changes: 5 additions & 0 deletions tests/analysis/normalization/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Tests for the normalization modules
Copyright © 2024 Pixelgen Technologies AB.
"""
26 changes: 26 additions & 0 deletions tests/analysis/normalization/test_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
Tests for the normalization modules
Copyright © 2024 Pixelgen Technologies AB.
"""

from pathlib import Path

import pandas as pd
from pandas.testing import assert_frame_equal
from pixelator.analysis.normalization import dsb_normalize

DATA_ROOT = Path(__file__).parents[2] / "data"


def test_dsb_normalize():
input_data = pd.read_csv(
str(DATA_ROOT / "dsb_normalization_test_input.csv")
).astype(float)
output_data = pd.read_csv(
str(DATA_ROOT / "dsb_normalization_test_output.csv")
).astype(float)
output_data = output_data - output_data.iloc[0, :]
result = dsb_normalize(input_data, isotype_controls=["mIgG1", "mIgG2a", "mIgG2b"])
result = result - result.iloc[0, :]
assert_frame_equal(result, output_data, atol=0.05)
Loading

0 comments on commit 206c806

Please sign in to comment.