Merge pull request #76 from ArcInstitute/dev

minor changes in docs and code
ArcInstitute · Jul 14, 2024 · b1d66e7 · b1d66e7
2 parents 6a51c4b + 5670534
commit b1d66e7
Show file tree

Hide file tree

Showing 9 changed files with 127 additions and 51 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -78,12 +78,14 @@
     'sphinx.ext.autodoc', 
     'sphinx.ext.napoleon',
     'sphinx.ext.intersphinx',
-    "sphinx.ext.extlinks",
+    'sphinx.ext.extlinks',
     'sphinx.ext.viewcode', 
-    "sphinxcontrib.bibtex",
+    'sphinxcontrib.bibtex',
     'myst_parser',
 ]
 
+suppress_warnings = ["myst.header"]
+
 # -- Options for HTML output -------------------------------------------
 # Activate the theme.
 html_theme = 'sphinx_rtd_theme'

diff --git a/docs/source/history.rst b/docs/source/history.rst
@@ -7,13 +7,13 @@ History
 * coming soon!
 
 0.4.0 - after (June 2024 - July 2024)
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * add command line interface, i.e. ``screenpro --help``
 * rename ``Counter`` class to ``GuideCounter`` for code clarity
 * major bug fixes and improvements in code formatting
 
 0.2.11 - 0.3.5 (Apr 2024 - June 2024)
-~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * introduce ``Counter`` class as wrapper for ``ngs`` module
 * improve core functionalities for CLI
 * major bug fixes

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -15,7 +15,7 @@ Welcome to ScreenPro2's documentation!
    assays
    ngs
    phenotype
-   visulize
+   visualize
    load
 
 .. toctree::

diff --git a/docs/source/phenotype.md b/docs/source/phenotype.md
@@ -1,43 +1,53 @@
 # Phenotype calculation modules
 
-Log ratio of $y$ vs $x$:
-
-$$\Delta=\log(\frac{\begin{bmatrix}{N_{y}}\end{bmatrix}_{(a,b)} + 1}{\begin{bmatrix}{N_{x}}\end{bmatrix}_{(a,b)} + 1})$$
-
--   $y \rightarrow$ condition $x$ (e.g. treated samples)
--   $x \rightarrow$ condition $y$ (e.g. $t_{0}$ samples)
--   $a \rightarrow$ number of library elements with sgRNAs targeting $T$
--   $b \rightarrow$ number of biological replicates, $R$ (e.g. 2 or 3)
--   $N_{x}$ \| $N_{y} \rightarrow$ read counts normalized for sequencing
-    depth in condition $x$ or $y$
+Log ratio of {math}`y` vs {math}`x`:
+
+```{math}
+\Delta =
+\log(\frac
+    {\begin{bmatrix}{N_{y}}\end{bmatrix}_{(a,b)} + 1}
+    {\begin{bmatrix}{N_{x}}\end{bmatrix}_{(a,b)} + 1}
+)
+```
+
+-   {math}`y \rightarrow` condition {math}`y` (e.g. treated samples)
+-   {math}`x \rightarrow` condition {math}`x` (e.g. {math}`t_{0}` samples)
+-   {math}`a \rightarrow` number of library elements with sgRNAs targeting {math}`T`
+-   {math}`b \rightarrow` number of biological replicates, {math}`R` (e.g. 2 or 3)
+-   {math}`N_{x}` \| {math}`N_{y} \rightarrow` read counts normalized for sequencing
+    depth in condition {math}`x` or {math}`y`
 
 Here is a formula for V3 library with single library element per gene
 (i.e. dual sgRNAs in one construct targeting same gene).
 
-Phenotype score for each $T$ comparing $y$ vs $x$:
+Phenotype score for each {math}`T` comparing {math}`y` vs {math}`x`:
 
-$$\text{PhenoScore}(T,x,y) =
+```{math}
+\text{PhenoScore}(T,x,y) =
 \left(
 \frac{
 \overline{\Delta_{(x,y)}}
 }{
 \text{median}( {\overline{\Delta_{(x_{ctrl},y_{ctrl})}}} )
 }
 \right)
-\times \frac{ 1 }{d_{growth}}$$
+\times \frac{ 1 }{d_{growth}}
+```
 
--   $\overline{\Delta(x,y)} \rightarrow$ log ratio averaged across
+-   {math}`\overline{\Delta_{(x,y)}} \rightarrow` log ratio averaged across
     replicates
--   $T \rightarrow$ library elements with sgRNAs targeting $T$
--   $d_{growth} \rightarrow$ growth factor to normalize the phenotype
+-   {math}`T \rightarrow` library elements with sgRNAs targeting {math}`T`
+-   {math}`d_{growth} \rightarrow` growth factor to normalize the phenotype
     score.
 
-Statistical test comparing $y$ vs $x$ per each target, $T$:
+Statistical test comparing {math}`y` vs {math}`x` per each target, {math}`T`:
 
-$$\text{p-value}(T,x,y) = \text{t-test} \left(
+```{math}
+\text{p-value}(T,x,y) = \text{t-test} \left(
 \begin{bmatrix}{N_{x}}\end{bmatrix}_{(a,b)},
 \begin{bmatrix}{N_{y}}\end{bmatrix}_{(a,b)}
-\right)$$
+\right)
+```
 
 (see this wikipedia page: [Dependent t-test for paired
 samples](https://en.wikipedia.org/wiki/Student%27s_t-test#Dependent_t-test_for_paired_samples))
@@ -46,18 +56,6 @@ samples](https://en.wikipedia.org/wiki/Student%27s_t-test#Dependent_t-test_for_p
 module](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html))
 
 > This is a test for the null hypothesis that two related or repeated
-> samples have identical average (expected) values.
+> samples have identical average (expected) values.
 
 ___
-
-```{eval-rst}  
-.. automodule:: screenpro.phenoscore
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-.. automodule:: screenpro.phenostats
-   :members:
-   :undoc-members:
-   :show-inheritance:
-```
diff --git a/screenpro/__init__.py b/screenpro/__init__.py
@@ -28,6 +28,6 @@
 from .ngs import GuideCounter
 from .assays import PooledScreens, GImaps
 
-__version__ = "0.4.2"
+__version__ = "0.4.3"
 __author__ = "Abe Arab"
 __email__ = '[email protected]' # "[email protected]"
diff --git a/screenpro/phenoscore/__init__.py b/screenpro/phenoscore/__init__.py
@@ -16,7 +16,7 @@
 from .delta import calculatePhenotypeScore, matrixTest
 from .deseq import runDESeq, extractDESeqResults
 from .annotate import annotateScoreTable
-from .phenostat import getFDR
+from .phenostat import multipleTestsCorrection
 
 
 def generatePseudoGeneAnnData(adata, num_pseudogenes='auto', pseudogene_size='auto', ctrl_label='negative_control'):
@@ -131,7 +131,7 @@ def runPhenoScore(adata, cond1, cond2, transformation, score_level, test,
             growth_rate=growth_rate
         )
         # get adjusted p-values
-        adj_p_values = getFDR(p_values)
+        adj_p_values = multipleTestsCorrection(p_values)
 
         # get targets
         targets = adata.var['target'].to_list()
@@ -203,7 +203,7 @@ def runPhenoScore(adata, cond1, cond2, transformation, score_level, test,
         p_values = [np.mean(p) for p in p_values]
 
         # get adjusted p-values
-        adj_p_values = getFDR(p_values)
+        adj_p_values = multipleTestsCorrection(p_values)
 
         # combine results into a dataframe
         result = pd.concat([

diff --git a/screenpro/phenoscore/phenostat.py b/screenpro/phenoscore/phenostat.py
@@ -45,22 +45,26 @@ def matrixStat(x, y, test, level):
         raise ValueError(f'Test "{test}" not recognized')
 
 
-def getFDR(p_values, method='fdr_bh'):
+def multipleTestsCorrection(p_values, method='fdr_bh'):
     """
-    Calculate FDR.
+    Calculate adjusted p-values using multiple testing correction.
 
     Parameters:
         p_values (np.array): array of p-values
-        method (str): method to use for calculating FDR
+        method (str): method to use for multiple testing correction
     
     Returns:
         np.array: array of adjusted p-values
     """
-    # fill na with 1
-    p_values[np.isnan(p_values)] = 1
-    # Calculate the adjusted p-values using the Benjamini-Hochberg method
-    if p_values is None:
-        raise ValueError('p_values is None')
-    _, adj_p_values, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
+    if method == 'fdr_bh':
+        # fill na with 1
+        p_values[np.isnan(p_values)] = 1
+        # Calculate the adjusted p-values using the Benjamini-Hochberg method
+        if p_values is None:
+            raise ValueError('p_values is None')
+        _, adj_p_values, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
+
+    else:
+        raise ValueError(f'Method "{method}" not recognized')
 
     return adj_p_values
diff --git a/screenpro/visualize/__init__.py b/screenpro/visualize/__init__.py
@@ -5,7 +5,9 @@
 
 import numpy as np
 import scanpy as sc
-from .qc_plots import *
+from .qc_plots import plotReplicateScatter
+from .rank import rankPlot
+
 
 ## Phenotype plotting functions
 class DrugScreenPlotter:

diff --git a/screenpro/visualize/rank.py b/screenpro/visualize/rank.py
@@ -0,0 +1,70 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from .utils import yellow_blue
+
+
+def rankPlot(df, rank_col, color_col=None, name_col='target', highlight_values_dict=None, xlabel='Rank', ylabel='Values', title='Rank Plot', ax=None, dot_size=1.5, highlight_size_factor=100, **args):
+    """
+    Plot the ranks against their values with specified color.
+
+    Args:
+        df (DataFrame): The input DataFrame.
+        rank_col (str): The column name containing the values to be ranked.
+        color_col (str): The column name containing the values to be used for color coding. Default is None.
+        name_col (str, optional): The column name containing the names of the values. Default is 'target'.
+        highlight_values_dict (dict, optional): A dictionary specifying the values to be highlighted. 
+            The keys are the highlight colors and the values are dictionaries with 'genes' and 'text' keys. 
+            'genes' is a list of values to be highlighted and 'text' is a boolean indicating whether to display 
+            the names of the highlighted values. Default is None.
+        xlabel (str, optional): The label for the x-axis. Default is 'Rank'.
+        ylabel (str, optional): The label for the y-axis. Default is 'Values'.
+        title (str, optional): The title of the plot. Default is 'Rank Plot'.
+        ax (matplotlib.axes.Axes, optional): The axis object to plot on. If not provided, a new axis will be created.
+        dot_size (float, optional): The size of the dots in the scatter plot. Default is 1.5.
+        highlight_size_factor (int, optional): The size factor for the highlighted dots. Default is 100.
+        **args: Additional keyword arguments to be passed to the scatter plot.
+
+    Returns:
+        matplotlib.axes.Axes: The axis object containing the plot.
+    """
+    # Create a new DataFrame with the values and their corresponding ranks
+    rank_df = df.copy()
+    rank_df['Rank'] = rank_df[rank_col].rank()
+    rank_df.sort_values('Rank', inplace=True)
+
+    # Use a color that is suitable for publications
+    if color_col is None:
+        color_col = 'darkgray'
+
+    # If no axis is provided, create one
+    if ax is None:
+        _, ax = plt.subplots()
+
+    # Plot the ranks against their values with specified color
+    rank_df.plot.scatter(
+        'Rank', rank_col, marker='o',
+        colormap=yellow_blue,
+        s=dot_size,
+        c=color_col, ax=ax,
+        colorbar=False,
+        **args
+    )
+
+    if highlight_values_dict is not None:
+        for highlight_color, highlight_values in highlight_values_dict.items():
+            highlight_ranks = rank_df[rank_df[name_col].isin(highlight_values['genes'])]
+            ax.plot(highlight_ranks['Rank'], highlight_ranks[rank_col], 'o', color=highlight_color, markersize=dot_size * highlight_size_factor)
+
+            if highlight_values['text'] is not False:
+                for i, row in highlight_ranks.iterrows():
+                    ax.text(row['Rank'] + .01, row[rank_col] + .001, row[name_col], fontsize=8, color=highlight_color, ha='right')
+
+    # Add labels and title
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(ylabel)
+    ax.set_title(title)
+
+    # Customize the grid lines for a clean look
+    ax.grid(False)
+
+    return ax