
Merge pull request #75 from ArcInstitute/dev
abearab authored Jul 13, 2024
2 parents 905bc98 + 5211d41 commit 6a51c4b
Showing 15 changed files with 520 additions and 413 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -180,7 +180,7 @@ The `AnnData` object must have the following contents:
used to store gene names. For dual or multiple targeting sgRNA libraries, this column can be used to store gene pairs
or any other relevant information about the target.
- "targetType": the type of target for each entry in reference sgRNA library. Note that this column is used to
distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negCtrl"`.
distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negative_control"`.
This is important for the phenotype calculation step.
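For example, a minimal `AnnData` might be constructed as follows (the sgRNA names, gene names, and the `"gene"` value for `"targetType"` are illustrative; only `"negative_control"` for negative controls is prescribed above):

```python
import numpy as np
import pandas as pd
import anndata as ad

# Reference sgRNA library: "target" holds gene names (or gene pairs),
# "targetType" marks negative controls as "negative_control".
guides = pd.DataFrame({
    'target':     ['GENE1', 'GENE2', 'non-targeting'],
    'targetType': ['gene',  'gene',  'negative_control'],  # 'gene' is an illustrative label
}, index=['GENE1_sg1', 'GENE2_sg1', 'negCtrl_sg1'])

# Samples (conditions / replicates) go in .obs; raw sgRNA counts go in .X.
samples = pd.DataFrame(index=['T0_rep1', 'DMSO_rep1', 'Drug_rep1'])

adata = ad.AnnData(
    X=np.random.poisson(100, size=(len(samples), len(guides))).astype(float),
    obs=samples,
    var=guides,
)
```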


@@ -220,8 +220,6 @@ screen.calculateDrugScreen(
t0='T0',
untreated='DMSO', # replace with the label for untreated condition
treated='Drug', # replace with the label for treated condition
db_untreated=1, # replace with doubling rate of untreated condition
db_treated=1, # replace with doubling rate of treated condition
score_level='compare_reps'
)
```
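Since `db_untreated` and `db_treated` are no longer passed to `calculateDrugScreen`, the growth-rate normalization is instead derived from a per-sample doubling column (see the `db_rate_col='pop_doublings'` default further down in `screenpro/assays.py`). A hedged sketch of providing that information, assuming the doublings live in `adata.obs` and that a `condition` column exists (both assumptions, not prescribed by this diff):

```python
# Illustrative doubling values per condition; the column names 'condition'
# and 'pop_doublings' are assumptions for this sketch.
doublings = {'T0': 0.0, 'DMSO': 5.0, 'Drug': 4.5}
screen.adata.obs['pop_doublings'] = screen.adata.obs['condition'].map(doublings)

screen.calculateDrugScreen(
    t0='T0',
    untreated='DMSO',
    treated='Drug',
    score_level='compare_reps'
)
```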
12 changes: 6 additions & 6 deletions docs/source/history.rst
@@ -8,20 +8,20 @@ History

0.4.0 - after (June 2024 - July 2024)
~~~~~~~~~~~~~~~~~~~
* add command line interface, i.e. `screenpro --help`
* rename `Counter` class to `GuideCounter` for code clarity
* add command line interface, i.e. ``screenpro --help``
* rename ``Counter`` class to ``GuideCounter`` for code clarity
* major bug fixes and improvements in code formatting

0.2.11 - 0.3.5 (Apr 2024 - June 2024)
~~~~~~~~~~~~~~~~~
* introduce `Counter` class as wrapper for `ngs` module
* introduce ``Counter`` class as wrapper for ``ngs`` module
* improve core functionalities for CLI
* major bug fixes

0.2.7 - 0.2.10 (Mar 2024 - Apr 2024)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* major reformatting and introduce alpha version of CLI
* introduce `ngs` module – process fastq files from single, dual, or multiplexed Cas9 and Cas12 screens
* introduce ``ngs`` module – process fastq files from single, dual, or multiplexed Cas9 and Cas12 screens

* Support both single-guide and dual-guide Cas9 library design `#37`_
(i.e. V2 and V3 genome-scale dCas9 screen platforms)
@@ -43,8 +43,8 @@ History
0.2.1 - 0.2.4 (July 2023 - Nov 2023)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Included PyPi package installation.
* Introduced `ScreenPro` class.
* Introduced a method in `ScreenPro` class for common drug screen analysis.
* Introduced ``ScreenPro`` class.
* Introduced a method in ``ScreenPro`` class for common drug screen analysis.

0.2.0 (May 2022)
~~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions screenpro/__init__.py
@@ -19,7 +19,7 @@
'''

from . import phenoscore as ps

from . import preprocessing as pp
from . import ngs
from . import assays
from . import load
@@ -28,6 +28,6 @@
from .ngs import GuideCounter
from .assays import PooledScreens, GImaps

__version__ = "0.4.1"
__version__ = "0.4.2"
__author__ = "Abe Arab"
__email__ = '[email protected]' # "[email protected]"
144 changes: 78 additions & 66 deletions screenpro/assays.py
@@ -12,10 +12,10 @@
import anndata as ad
import scanpy as sc

from pydeseq2 import preprocessing
from .phenoscore.deseq import runDESeq, extractDESeqResults
from .phenoscore import runDESeq, extractDESeqResults
from .phenoscore import runPhenoScore, runPhenoScoreForReplicate
from .phenoscore.annotate import ann_score_df
from .preprocessing import addPseudoCount, findLowCounts, normalizeSeqDepth
from .phenoscore.annotate import annotateScoreTable, hit_dict
from copy import copy


@@ -24,12 +24,14 @@ class PooledScreens(object):
pooledScreens class for processing CRISPR screen datasets
"""

def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3):
def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3, verbose=False):
"""
Args:
adata (AnnData): AnnData object with adata.X as a matrix of sgRNA counts
fc_transformation (str): fold change transformation to apply for calculating phenotype scores
test (str): statistical test to use for calculating phenotype scores
n_reps (int): number of replicates to use for calculating phenotype scores
verbose (bool): whether to print verbose output
"""
self.adata = adata.copy()
self.pdata = None
@@ -38,6 +40,7 @@ def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3):
self.n_reps = n_reps
self.phenotypes = {}
self.phenotype_names = []
self.verbose = verbose

# def __repr__(self):
# descriptions = ''
@@ -83,18 +86,34 @@ def _calculateGrowthFactor(self, untreated, treated, db_rate_col):

return out

def countNormalization(self):
def filterLowCounts(self, filter_type='either', minimum_reads=50):
"""
Filter low counts in adata.X
"""
findLowCounts(
self.adata,
filter_type=filter_type,
minimum_reads=minimum_reads,
verbose=self.verbose
)

self.adata = self.adata[:,~self.adata.var.low_count].copy()

def countNormalization(self, pseudo_count_value=0.5):
"""
Normalize the counts data in adata.X
"""
self.adata.layers['raw_counts'] = self.adata.X.copy()

# add pseudocount
addPseudoCount(self.adata, behavior='default', value=pseudo_count_value)

if self.verbose: print('Pseudocount added to counts.')

# normalize counts by sequencing depth
norm_counts, size_factors = preprocessing.deseq2_norm(self.adata.X)
# update adata object
self.adata.obs['size_factors'] = size_factors
self.adata.layers['seq_depth_norm'] = norm_counts
self.adata.X = self.adata.layers['seq_depth_norm']
normalizeSeqDepth(self.adata)

if self.verbose: print('Counts normalized by sequencing depth.')
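A usage sketch for the new preprocessing methods above (the argument values and two-replicate setup are illustrative, not prescribed by this diff):

```python
screen = PooledScreens(adata, n_reps=2, verbose=True)

# Drop sgRNAs with fewer than 50 reads in either condition being compared.
screen.filterLowCounts(filter_type='either', minimum_reads=50)

# Add a 0.5 pseudocount, then normalize counts by sequencing depth.
screen.countNormalization(pseudo_count_value=0.5)
```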

def calculateDrugScreenDESeq(self, t0, untreated, treated, run_name=None, **kwargs):
"""
@@ -126,22 +145,26 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
f'gamma:{gamma_name}': gamma, f'tau:{tau_name}': tau, f'rho:{rho_name}': rho
}, axis=1)

def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated, score_level, db_rate_col='pop_doublings', run_name=None, **kwargs):
def calculateDrugScreen(self, t0, untreated, treated, score_level, db_rate_col='pop_doublings', run_name=None, **kwargs):
"""
Calculate `gamma`, `rho`, and `tau` phenotype scores for a drug screen dataset in a given `score_level`.
To normalize by growth rate, the doubling rates of the untreated and treated conditions are required.
Args:
t0 (str): name of the T0 (initial timepoint) condition
untreated (str): name of the untreated condition
treated (str): name of the treated condition
db_untreated (float): doubling rate of the untreated condition
db_treated (float): doubling rate of the treated condition
score_level (str): name of the score level
db_rate_col (str): column name for the doubling rate, default is 'pop_doublings'
run_name (str): name for the phenotype calculation run
**kwargs: additional arguments to pass to runPhenoScore
"""
growth_factor_table = self._calculateGrowthFactor(
untreated = untreated, treated = treated, db_rate_col = db_rate_col
)

db_untreated=growth_factor_table.query(f'score=="gamma"')['growth_factor'].mean()
db_treated=growth_factor_table.query(f'score=="tau"')['growth_factor'].mean()

# calculate phenotype scores: gamma, tau, rho
gamma_name, gamma = runPhenoScore(
self.adata, cond1=t0, cond2=untreated, growth_rate=db_untreated,
@@ -174,10 +197,6 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
self._add_phenotype_results(f'tau:{tau_name}')
self._add_phenotype_results(f'rho:{rho_name}')

growth_factor_table = self._calculateGrowthFactor(
untreated = untreated, treated = treated, db_rate_col = db_rate_col
)

# get replicate level phenotype scores
pdata_df = pd.concat([
runPhenoScoreForReplicate(
@@ -227,33 +246,19 @@ def calculateFlowBasedScreen(self, low_bin, high_bin, score_level, run_name=None
# save phenotype name for reference
self._add_phenotype_results(f'delta:{delta_name}')

def getPhenotypeScores(self, score_name, run_name='auto', threshold=5, ctrl_label='control', target_col='target',pvalue_column='ttest pvalue', score_column='score'):
def getPhenotypeScores(self, score_name, threshold, run_name='auto', ctrl_label='negative_control', target_col='target',pvalue_col='ttest pvalue', score_col='score'):
"""
Get phenotype scores for a given score level
Args:
run_name (str): name of the phenotype calculation run to retrieve
score_name (str): name of the score to retrieve, e.g. 'gamma', 'tau', 'rho', 'delta'
threshold (float): threshold for filtering significant hits, default is 5
ctrl_label (str): label for the negative control, default is 'control'
run_name (str): name of the phenotype calculation run to retrieve
ctrl_label (str): label for the negative control, default is 'negative_control'
target_col (str): column name for the target gene, default is 'target'
pvalue_col (str): column name for the p-value, default is 'ttest pvalue'
score_col (str): column name for the score, default is 'score'
"""
hit_dict = {
'gamma':{
'up_hit':'up_hit',
'down_hit':'essential_hit'
},
'tau':{
'up_hit':'up_hit',
'down_hit':'down_hit'
},
'rho':{
'up_hit':'resistance_hit',
'down_hit':'sensitivity_hit'
}
}

if run_name == 'auto':
if len(list(self.phenotypes.keys())) == 1:
@@ -268,34 +273,35 @@ def getPhenotypeScores(self, score_name, run_name='auto', threshold=5, ctrl_labe
if score_name not in self.phenotype_names:
raise ValueError(f"Phenotype '{score_name}' not found in self.phenotype_names")

keep_col = [target_col, score_column, pvalue_column]
keep_col = [target_col, score_col, pvalue_col]
score_tag = score_name.split(':')[0]
out = ann_score_df(
out = annotateScoreTable(
self.phenotypes[run_name][score_name].loc[:,keep_col],
ctrl_label=ctrl_label,
threshold=threshold,
up_hit=hit_dict[score_tag]['up_hit'],
down_hit=hit_dict[score_tag]['down_hit'],
threshold=threshold
ctrl_label=ctrl_label,
score_col=score_col,
pvalue_col=pvalue_col
)

return out

def getAnnotatedTable(self, run_name='auto', threshold=5, ctrl_label='control', target_col='target',pvalue_column='ttest pvalue', score_column='score'):
hit_dict = {
'gamma':{
'up_hit':'up_hit',
'down_hit':'essential_hit'
},
'tau':{
'up_hit':'up_hit',
'down_hit':'down_hit'
},
'rho':{
'up_hit':'resistance_hit',
'down_hit':'sensitivity_hit'
}
}

def getAnnotatedTable(self, threshold, run_name='auto', ctrl_label='negative_control', target_col='target', pvalue_col='ttest pvalue', score_col='score'):
"""
Returns an annotated table with scores, labels, and replicate phenotypes.
Args:
threshold (int, optional): The threshold value for determining hits. Defaults to 5.
run_name (str, optional): The name of the phenotype calculation run. Defaults to 'auto'.
ctrl_label (str, optional): The label for the control group. Defaults to 'negative_control'.
target_col (str, optional): The column name for the target. Defaults to 'target'.
pvalue_col (str, optional): The column name for the p-value. Defaults to 'ttest pvalue'.
score_col (str, optional): The column name for the score. Defaults to 'score'.
Returns:
pandas.DataFrame: An annotated table with scores, labels, and replicate phenotypes.
"""
if run_name == 'auto':
if len(list(self.phenotypes.keys())) == 1:
run_name = list(self.phenotypes.keys())[0]
@@ -306,34 +312,40 @@ def getAnnotatedTable(self, run_name='auto', threshold=5, ctrl_label='control',
'' + ', '.join(self.phenotypes.keys())
)

keep_col = [target_col, score_column, pvalue_column]

keep_col = [target_col, score_col, pvalue_col]
score_names = {s for s, col in self.phenotypes[run_name].columns}
sort_var = self.adata.var.sort_values(['targetType','target']).index.to_list()

df_list = {}
for score_name in score_names:
score_tag = score_name.split(':')[0]
# get label
df_label = ann_score_df(

# get annotated table
df_ann = annotateScoreTable(
self.phenotypes[run_name][score_name].loc[:,keep_col],
up_hit=hit_dict[score_tag]['up_hit'],
down_hit=hit_dict[score_tag]['down_hit'],
score_col=score_col,
pvalue_col=pvalue_col,
ctrl_label=ctrl_label,
threshold=threshold
)['label']
)

# get replicate phe
df_phe_reps = self.pdata[self.pdata.obs.score.eq(score_tag)].to_df().T

# make table
df = pd.concat([
self.phenotypes['compare_reps'][score_name], df_phe_reps, df_label
df_ann.drop(columns=['label']),
df_phe_reps,
df_ann['label']
],axis=1).loc[sort_var,:]

df_list.update({score_name:df})

out = pd.concat(df_list,axis=1)

return out
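A usage sketch for the updated result accessors (the score-name lookup and threshold value are illustrative; note that `threshold` is now a required argument and `ctrl_label` defaults to `'negative_control'`):

```python
# Phenotype names are stored as '<score>:<name>' strings, e.g. 'rho:...'.
print(screen.phenotype_names)

rho_hits = screen.getPhenotypeScores(
    score_name=[p for p in screen.phenotype_names if p.startswith('rho:')][0],
    threshold=5,
)

annotated = screen.getAnnotatedTable(threshold=5)
```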


