il2 importance analysis (#264)

* first draft il2 importance analysis * fix melanoma il2 importance analysis replication * streamline phenotypes' importance analyzed
nadeemlab · Dec 1, 2023 · 01ea843 · 01ea843
1 parent 78f9732
commit 01ea843
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 8 deletions.
diff --git a/analysis_replication/accessors.py b/analysis_replication/accessors.py
@@ -48,6 +48,7 @@ def sleep_poll():
 
 class DataAccessor:
     """Convenience caller of HTTP methods for data access."""
+
     def __init__(self, study, host=None):
         _host = get_default_host(host)
         if _host is None:
@@ -68,15 +69,21 @@ def counts(self, phenotype_names):
         if isinstance(phenotype_names, str):
             phenotype_names = [phenotype_names]
         conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
-        all_name = ' and '.join([self._name_phenotype(p) for p in phenotype_names])
+        all_name = self.name_for_all_phenotypes(phenotype_names)
         conjunction_counts_series = self._get_counts_series(conjunction_criteria, all_name)
         individual_counts_series = [
             self._get_counts_series(self._phenotype_criteria(name), self._name_phenotype(name))
             for name in phenotype_names
         ]
-        df = concat([self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series], axis=1)
+        df = concat(
+            [self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series],
+            axis=1,
+        )
         df.replace([inf, -inf], nan, inplace=True)
         return df
+
+    def name_for_all_phenotypes(self, phenotype_names):
+        return ' and '.join([self._name_phenotype(p) for p in phenotype_names])
 
     def neighborhood_enrichment(self, phenotype_names):
         feature_class = 'neighborhood enrichment'
@@ -251,6 +258,24 @@ def _name_phenotype(self, phenotype):
             ]).rstrip()
         return str(phenotype)
 
+    def important(self, phenotype_names: str | list[str]) -> dict[str, float]:
+        if isinstance(phenotype_names, str):
+            phenotype_names = [phenotype_names]
+        conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
+        parts = list(chain(*[
+            [(f'{keyword}_marker', channel) for channel in argument]
+            for keyword, argument in zip(
+                ['positive', 'negative'], [
+                    conjunction_criteria['positive_markers'],
+                    conjunction_criteria['negative_markers'],
+                ])
+        ]))
+        parts = sorted(list(set(parts)))
+        parts.append(('study', self.study))
+        query = urlencode(parts)
+        phenotype_counts, _ = self._retrieve('cggnn-importance-composition', query)
+        return {c['specimen']: c['percentage'] for c in phenotype_counts['counts']}
+
 
 class ExpectedQuantitativeValueError(ValueError):
     """
@@ -293,7 +318,13 @@ def handle_expected_actual(expected: float, actual: float | None):
     print(Colors.bold_green + padded + Colors.reset, end='')
 
 
-def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: bool = False, show_pvalue = False):
+def univariate_pair_compare(
+        list1,
+        list2,
+        expected_fold=None,
+        do_log_fold: bool = False,
+        show_pvalue=False,
+):
     list1 = list(filter(lambda element: not isnan(element), list1.values))
     list2 = list(filter(lambda element: not isnan(element), list2.values))
 
@@ -302,7 +333,7 @@ def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: boo
     actual = mean2 / mean1
     if expected_fold is not None:
         handle_expected_actual(expected_fold, actual)
-    print((mean2, mean1, actual), end = '')
+    print((mean2, mean1, actual), end='')
 
     if do_log_fold:
         _list1 = [log(e) for e in list(filter(lambda element: element != 0, list1))]
@@ -315,11 +346,14 @@ def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: boo
     if show_pvalue:
         if do_log_fold:
             result = ttest_ind(_list1, _list2)
-            print('  p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')
+            print(
+                '  p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end=''
+            )
         else:
             result = ttest_ind(list1, list2)
             print('  p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')
 
     print('')
 
-
+def print_comparison() -> None:
+    pass
diff --git a/analysis_replication/melanoma_il2.py b/analysis_replication/melanoma_il2.py
@@ -1,9 +1,15 @@
-"""Data analysis script for one dataset."""
+"""Data analysis script for one the "Melanoma intralesional IL2" study."""
+
 import sys
+from pandas import DataFrame, Series
+
+from scipy.stats import fisher_exact
 
 from accessors import DataAccessor
 from accessors import get_default_host
 from accessors import univariate_pair_compare as compare
+from accessors import handle_expected_actual
+
 
 def test(host):
     study = 'Melanoma intralesional IL2'
@@ -50,7 +56,36 @@ def test(host):
     values3 = df[df['cohort'] == '3']['neighborhood enrichment, B cell and B cell']
     compare(values3, values1, expected_fold=80.45, do_log_fold=True)
 
-if __name__=='__main__':
+    for phenotype, expected_baseline, expected_percentage, expected_p in [
+        ('Adipocyte or Langerhans cell', 3.593e-2, 80, 6e-34),
+        ([{'positive_markers': ['S100B'], 'negative_markers': ['SOX10']}], 6.710e-2, 82, 2.783e-21)
+    ]:
+        result_df = DataFrame(columns=['odd ratio', 'p-value'])
+        result_df.index.name = 'specimen'
+        df = access.counts(phenotype)
+        df = df[df['cohort'].isin({'1', '3'})]
+        important_proportion = access.important(phenotype)
+        if (type(phenotype) is list):
+            phenotype = access.name_for_all_phenotypes(phenotype)
+        for specimen, row in df.iterrows():
+            n_cells_of_this_phenotype = row[phenotype][0]
+            n_cells_total = row['all cells']
+            p_important = important_proportion[specimen]
+            odd_ratio, p_value = fisher_exact([
+                [n_cells_of_this_phenotype, p_important],
+                [n_cells_total, 100],
+            ])
+            result_df.loc[specimen] = [odd_ratio, p_value]
+        print(f'\nBaseline presence of {phenotype}')
+        handle_expected_actual(expected_baseline, (df[phenotype].iloc[:, 0]/df['all cells']).mean())
+        print(f'\nPercentage of top 100 most important cells')
+        handle_expected_actual(expected_percentage, Series(important_proportion).mean())
+        print(f'\nLargest p-value from fisher exact test across all specimens')
+        handle_expected_actual(expected_p, result_df['p-value'].max())
+        print('')
+
+
+if __name__ == '__main__':
     host: str | None
     if len(sys.argv) == 2:
         host = sys.argv[1]