
Merge pull request #75 from ArcInstitute/dev
abearab authored Jul 13, 2024
2 parents 905bc98 + 5211d41 commit 6a51c4b
Showing 15 changed files with 520 additions and 413 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -180,7 +180,7 @@ The `AnnData` object must have the following contents:
used to store gene names. For dual or multiple targeting sgRNA libraries, this column can be used to store gene pairs
or any other relevant information about the target.
- "targetType": the type of target for each entry in reference sgRNA library. Note that this column is used to
distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negCtrl"`.
distinguish between different types of sgRNAs in the library and negative control sgRNAs can be defined as `"targetType" == "negative_control"`.
This is important for the phenotype calculation step.
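For example, a minimal `AnnData` might be constructed as follows (the sgRNA names, gene names, and the `"gene"` value for `"targetType"` are illustrative; only `"negative_control"` for negative controls is prescribed above):

```python
import numpy as np
import pandas as pd
import anndata as ad

# Reference sgRNA library: "target" holds gene names (or gene pairs),
# "targetType" marks negative controls as "negative_control".
guides = pd.DataFrame({
    'target':     ['GENE1', 'GENE2', 'non-targeting'],
    'targetType': ['gene',  'gene',  'negative_control'],  # 'gene' is an illustrative label
}, index=['GENE1_sg1', 'GENE2_sg1', 'negCtrl_sg1'])

# Samples (conditions / replicates) go in .obs; raw sgRNA counts go in .X.
samples = pd.DataFrame(index=['T0_rep1', 'DMSO_rep1', 'Drug_rep1'])

adata = ad.AnnData(
    X=np.random.poisson(100, size=(len(samples), len(guides))).astype(float),
    obs=samples,
    var=guides,
)
```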


@@ -220,8 +220,6 @@ screen.calculateDrugScreen(
t0='T0',
untreated='DMSO', # replace with the label for untreated condition
treated='Drug', # replace with the label for treated condition
db_untreated=1, # replace with doubling rate of untreated condition
db_treated=1, # replace with doubling rate of treated condition
score_level='compare_reps'
)
```
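Since `db_untreated` and `db_treated` are no longer passed to `calculateDrugScreen`, the growth-rate normalization is instead derived from a per-sample doubling column (see the `db_rate_col='pop_doublings'` default further down in `screenpro/assays.py`). A hedged sketch of providing that information, assuming the doublings live in `adata.obs` and that a `condition` column exists (both assumptions, not prescribed by this diff):

```python
# Illustrative doubling values per condition; the column names 'condition'
# and 'pop_doublings' are assumptions for this sketch.
doublings = {'T0': 0.0, 'DMSO': 5.0, 'Drug': 4.5}
screen.adata.obs['pop_doublings'] = screen.adata.obs['condition'].map(doublings)

screen.calculateDrugScreen(
    t0='T0',
    untreated='DMSO',
    treated='Drug',
    score_level='compare_reps'
)
```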
12 changes: 6 additions & 6 deletions docs/source/history.rst
@@ -8,20 +8,20 @@ History

0.4.0 - after (June 2024 - July 2024)
~~~~~~~~~~~~~~~~~~~
* add command line interface, i.e. `screenpro --help`
* rename `Counter` class to `GuideCounter` for code clarity
* add command line interface, i.e. ``screenpro --help``
* rename ``Counter`` class to ``GuideCounter`` for code clarity
* major bug fixes and improvements in code formatting

0.2.11 - 0.3.5 (Apr 2024 - June 2024)
~~~~~~~~~~~~~~~~~
* introduce `Counter` class as wrapper for `ngs` module
* introduce ``Counter`` class as wrapper for ``ngs`` module
* improve core functionalities for CLI
* major bug fixes

0.2.7 - 0.2.10 (Mar 2024 - Apr 2024)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* major reformatting and introduce alpha version of CLI
* introduce `ngs` module – process fastq files from single, dual, or multiplexed Cas9 and Cas12 screens
* introduce ``ngs`` module – process fastq files from single, dual, or multiplexed Cas9 and Cas12 screens

* Support both single-guide and dual-guide Cas9 library design `#37`_
(i.e. V2 and V3 genome-scale dCas9 screen platforms)
@@ -43,8 +43,8 @@ History
0.2.1 - 0.2.4 (July 2023 - Nov 2023)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Included PyPi package installation.
* Introduced `ScreenPro` class.
* Introduced a method in `ScreenPro` class for common drug screen analysis.
* Introduced ``ScreenPro`` class.
* Introduced a method in ``ScreenPro`` class for common drug screen analysis.

0.2.0 (May 2022)
~~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions screenpro/__init__.py
@@ -19,7 +19,7 @@
'''

from . import phenoscore as ps

from . import preprocessing as pp
from . import ngs
from . import assays
from . import load
@@ -28,6 +28,6 @@
from .ngs import GuideCounter
from .assays import PooledScreens, GImaps

__version__ = "0.4.1"
__version__ = "0.4.2"
__author__ = "Abe Arab"
__email__ = '[email protected]' # "[email protected]"
144 changes: 78 additions & 66 deletions screenpro/assays.py
@@ -12,10 +12,10 @@
import anndata as ad
import scanpy as sc

from pydeseq2 import preprocessing
from .phenoscore.deseq import runDESeq, extractDESeqResults
from .phenoscore import runDESeq, extractDESeqResults
from .phenoscore import runPhenoScore, runPhenoScoreForReplicate
from .phenoscore.annotate import ann_score_df
from .preprocessing import addPseudoCount, findLowCounts, normalizeSeqDepth
from .phenoscore.annotate import annotateScoreTable, hit_dict
from copy import copy


@@ -24,12 +24,14 @@ class PooledScreens(object):
pooledScreens class for processing CRISPR screen datasets
"""

def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3):
def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3, verbose=False):
"""
Args:
adata (AnnData): AnnData object with adata.X as a matrix of sgRNA counts
fc_transformation (str): fold change transformation to apply for calculating phenotype scores
test (str): statistical test to use for calculating phenotype scores
n_reps (int): number of replicates to use for calculating phenotype scores
verbose (bool): whether to print verbose output
"""
self.adata = adata.copy()
self.pdata = None
@@ -38,6 +40,7 @@ def __init__(self, adata, fc_transformation='log2', test='ttest', n_reps=3):
self.n_reps = n_reps
self.phenotypes = {}
self.phenotype_names = []
self.verbose = verbose

# def __repr__(self):
# descriptions = ''
@@ -83,18 +86,34 @@ def _calculateGrowthFactor(self, untreated, treated, db_rate_col):

return out

def countNormalization(self):
def filterLowCounts(self, filter_type='either', minimum_reads=50):
"""
Filter low counts in adata.X
"""
findLowCounts(
self.adata,
filter_type=filter_type,
minimum_reads=minimum_reads,
verbose=self.verbose
)

self.adata = self.adata[:,~self.adata.var.low_count].copy()

def countNormalization(self, pseudo_count_value=0.5):
"""
Normalize the counts data in adata.X
"""
self.adata.layers['raw_counts'] = self.adata.X.copy()

# add pseudocount
addPseudoCount(self.adata, behavior='default', value=pseudo_count_value)

if self.verbose: print('Pseudocount added to counts.')

# normalize counts by sequencing depth
norm_counts, size_factors = preprocessing.deseq2_norm(self.adata.X)
# update adata object
self.adata.obs['size_factors'] = size_factors
self.adata.layers['seq_depth_norm'] = norm_counts
self.adata.X = self.adata.layers['seq_depth_norm']
normalizeSeqDepth(self.adata)

if self.verbose: print('Counts normalized by sequencing depth.')
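A usage sketch for the new preprocessing methods above (the argument values and two-replicate setup are illustrative, not prescribed by this diff):

```python
screen = PooledScreens(adata, n_reps=2, verbose=True)

# Drop sgRNAs with fewer than 50 reads in either condition being compared.
screen.filterLowCounts(filter_type='either', minimum_reads=50)

# Add a 0.5 pseudocount, then normalize counts by sequencing depth.
screen.countNormalization(pseudo_count_value=0.5)
```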

def calculateDrugScreenDESeq(self, t0, untreated, treated, run_name=None, **kwargs):
"""
@@ -126,22 +145,26 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
f'gamma:{gamma_name}': gamma, f'tau:{tau_name}': tau, f'rho:{rho_name}': rho
}, axis=1)

def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated, score_level, db_rate_col='pop_doublings', run_name=None, **kwargs):
def calculateDrugScreen(self, t0, untreated, treated, score_level, db_rate_col='pop_doublings', run_name=None, **kwargs):
"""
Calculate `gamma`, `rho`, and `tau` phenotype scores for a drug screen dataset in a given `score_level`.
To normalize by growth rate, the doubling rates of the untreated and treated conditions are required.
Args:
t0 (str): name of the T0 (initial timepoint) condition
untreated (str): name of the untreated condition
treated (str): name of the treated condition
db_untreated (float): doubling rate of the untreated condition
db_treated (float): doubling rate of the treated condition
score_level (str): name of the score level
db_rate_col (str): column name for the doubling rate, default is 'pop_doublings'
run_name (str): name for the phenotype calculation run
**kwargs: additional arguments to pass to runPhenoScore
"""
growth_factor_table = self._calculateGrowthFactor(
untreated = untreated, treated = treated, db_rate_col = db_rate_col
)

db_untreated=growth_factor_table.query(f'score=="gamma"')['growth_factor'].mean()
db_treated=growth_factor_table.query(f'score=="tau"')['growth_factor'].mean()

# calculate phenotype scores: gamma, tau, rho
gamma_name, gamma = runPhenoScore(
self.adata, cond1=t0, cond2=untreated, growth_rate=db_untreated,
@@ -174,10 +197,6 @@ def calculateDrugScreen(self, t0, untreated, treated, db_untreated, db_treated,
self._add_phenotype_results(f'tau:{tau_name}')
self._add_phenotype_results(f'rho:{rho_name}')

growth_factor_table = self._calculateGrowthFactor(
untreated = untreated, treated = treated, db_rate_col = db_rate_col
)

# get replicate level phenotype scores
pdata_df = pd.concat([
runPhenoScoreForReplicate(
@@ -227,33 +246,19 @@ def calculateFlowBasedScreen(self, low_bin, high_bin, score_level, run_name=None
# save phenotype name for reference
self._add_phenotype_results(f'delta:{delta_name}')

def getPhenotypeScores(self, score_name, run_name='auto', threshold=5, ctrl_label='control', target_col='target',pvalue_column='ttest pvalue', score_column='score'):
def getPhenotypeScores(self, score_name, threshold, run_name='auto', ctrl_label='negative_control', target_col='target',pvalue_col='ttest pvalue', score_col='score'):
"""
Get phenotype scores for a given score level
Args:
run_name (str): name of the phenotype calculation run to retrieve
score_name (str): name of the score to retrieve, e.g. 'gamma', 'tau', 'rho', 'delta'
threshold (float): threshold for filtering significant hits, default is 5
ctrl_label (str): label for the negative control, default is 'control'
run_name (str): name of the phenotype calculation run to retrieve
ctrl_label (str): label for the negative control, default is 'negative_control'
target_col (str): column name for the target gene, default is 'target'
pvalue_col (str): column name for the p-value, default is 'ttest pvalue'
score_col (str): column name for the score, default is 'score'
"""
hit_dict = {
'gamma':{
'up_hit':'up_hit',
'down_hit':'essential_hit'
},
'tau':{
'up_hit':'up_hit',
'down_hit':'down_hit'
},
'rho':{
'up_hit':'resistance_hit',
'down_hit':'sensitivity_hit'
}
}

if run_name == 'auto':
if len(list(self.phenotypes.keys())) == 1:
@@ -268,34 +273,35 @@ def getPhenotypeScores(self, score_name, run_name='auto', threshold=5, ctrl_labe
if score_name not in self.phenotype_names:
raise ValueError(f"Phenotype '{score_name}' not found in self.phenotype_names")

keep_col = [target_col, score_column, pvalue_column]
keep_col = [target_col, score_col, pvalue_col]
score_tag = score_name.split(':')[0]
out = ann_score_df(
out = annotateScoreTable(
self.phenotypes[run_name][score_name].loc[:,keep_col],
ctrl_label=ctrl_label,
threshold=threshold,
up_hit=hit_dict[score_tag]['up_hit'],
down_hit=hit_dict[score_tag]['down_hit'],
threshold=threshold
ctrl_label=ctrl_label,
score_col=score_col,
pvalue_col=pvalue_col
)

return out

def getAnnotatedTable(self, run_name='auto', threshold=5, ctrl_label='control', target_col='target',pvalue_column='ttest pvalue', score_column='score'):
hit_dict = {
'gamma':{
'up_hit':'up_hit',
'down_hit':'essential_hit'
},
'tau':{
'up_hit':'up_hit',
'down_hit':'down_hit'
},
'rho':{
'up_hit':'resistance_hit',
'down_hit':'sensitivity_hit'
}
}

def getAnnotatedTable(self, threshold, run_name='auto', ctrl_label='negative_control', target_col='target', pvalue_col='ttest pvalue', score_col='score'):
"""
Returns an annotated table with scores, labels, and replicate phenotypes.
Args:
threshold (int, optional): The threshold value for determining hits. Defaults to 5.
run_name (str, optional): The name of the phenotype calculation run. Defaults to 'auto'.
ctrl_label (str, optional): The label for the control group. Defaults to 'negative_control'.
target_col (str, optional): The column name for the target. Defaults to 'target'.
pvalue_col (str, optional): The column name for the p-value. Defaults to 'ttest pvalue'.
score_col (str, optional): The column name for the score. Defaults to 'score'.
Returns:
pandas.DataFrame: An annotated table with scores, labels, and replicate phenotypes.
"""
if run_name == 'auto':
if len(list(self.phenotypes.keys())) == 1:
run_name = list(self.phenotypes.keys())[0]
@@ -306,34 +312,40 @@ def getAnnotatedTable(self, run_name='auto', threshold=5, ctrl_label='control',
'' + ', '.join(self.phenotypes.keys())
)

keep_col = [target_col, score_column, pvalue_column]

keep_col = [target_col, score_col, pvalue_col]
score_names = {s for s, col in self.phenotypes[run_name].columns}
sort_var = self.adata.var.sort_values(['targetType','target']).index.to_list()

df_list = {}
for score_name in score_names:
score_tag = score_name.split(':')[0]
# get label
df_label = ann_score_df(

# get annotated table
df_ann = annotateScoreTable(
self.phenotypes[run_name][score_name].loc[:,keep_col],
up_hit=hit_dict[score_tag]['up_hit'],
down_hit=hit_dict[score_tag]['down_hit'],
score_col=score_col,
pvalue_col=pvalue_col,
ctrl_label=ctrl_label,
threshold=threshold
)['label']
)

# get replicate phe
df_phe_reps = self.pdata[self.pdata.obs.score.eq(score_tag)].to_df().T

# make table
df = pd.concat([
self.phenotypes['compare_reps'][score_name], df_phe_reps, df_label
df_ann.drop(columns=['label']),
df_phe_reps,
df_ann['label']
],axis=1).loc[sort_var,:]

df_list.update({score_name:df})

out = pd.concat(df_list,axis=1)

return out
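A usage sketch for the updated result accessors (the score-name lookup and threshold value are illustrative; note that `threshold` is now a required argument and `ctrl_label` defaults to `'negative_control'`):

```python
# Phenotype names are stored as '<score>:<name>' strings, e.g. 'rho:...'.
print(screen.phenotype_names)

rho_hits = screen.getPhenotypeScores(
    score_name=[p for p in screen.phenotype_names if p.startswith('rho:')][0],
    threshold=5,
)

annotated = screen.getAnnotatedTable(threshold=5)
```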


