Skip to content

Commit

Permalink
il2 importance analysis (#264)
Browse files Browse the repository at this point in the history
* first draft il2 importance analysis

* fix melanoma il2 importance analysis replication

* streamline phenotypes' importance analyzed
  • Loading branch information
CarlinLiao authored Dec 1, 2023
1 parent 78f9732 commit 01ea843
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 8 deletions.
46 changes: 40 additions & 6 deletions analysis_replication/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def sleep_poll():

class DataAccessor:
"""Convenience caller of HTTP methods for data access."""

def __init__(self, study, host=None):
_host = get_default_host(host)
if _host is None:
Expand All @@ -68,15 +69,21 @@ def counts(self, phenotype_names):
if isinstance(phenotype_names, str):
phenotype_names = [phenotype_names]
conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
all_name = ' and '.join([self._name_phenotype(p) for p in phenotype_names])
all_name = self.name_for_all_phenotypes(phenotype_names)
conjunction_counts_series = self._get_counts_series(conjunction_criteria, all_name)
individual_counts_series = [
self._get_counts_series(self._phenotype_criteria(name), self._name_phenotype(name))
for name in phenotype_names
]
df = concat([self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series], axis=1)
df = concat(
[self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series],
axis=1,
)
df.replace([inf, -inf], nan, inplace=True)
return df

def name_for_all_phenotypes(self, phenotype_names):
return ' and '.join([self._name_phenotype(p) for p in phenotype_names])

def neighborhood_enrichment(self, phenotype_names):
feature_class = 'neighborhood enrichment'
Expand Down Expand Up @@ -251,6 +258,24 @@ def _name_phenotype(self, phenotype):
]).rstrip()
return str(phenotype)

def important(self, phenotype_names: str | list[str]) -> dict[str, float]:
if isinstance(phenotype_names, str):
phenotype_names = [phenotype_names]
conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
parts = list(chain(*[
[(f'{keyword}_marker', channel) for channel in argument]
for keyword, argument in zip(
['positive', 'negative'], [
conjunction_criteria['positive_markers'],
conjunction_criteria['negative_markers'],
])
]))
parts = sorted(list(set(parts)))
parts.append(('study', self.study))
query = urlencode(parts)
phenotype_counts, _ = self._retrieve('cggnn-importance-composition', query)
return {c['specimen']: c['percentage'] for c in phenotype_counts['counts']}


class ExpectedQuantitativeValueError(ValueError):
"""
Expand Down Expand Up @@ -293,7 +318,13 @@ def handle_expected_actual(expected: float, actual: float | None):
print(Colors.bold_green + padded + Colors.reset, end='')


def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: bool = False, show_pvalue = False):
def univariate_pair_compare(
list1,
list2,
expected_fold=None,
do_log_fold: bool = False,
show_pvalue=False,
):
list1 = list(filter(lambda element: not isnan(element), list1.values))
list2 = list(filter(lambda element: not isnan(element), list2.values))

Expand All @@ -302,7 +333,7 @@ def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: boo
actual = mean2 / mean1
if expected_fold is not None:
handle_expected_actual(expected_fold, actual)
print((mean2, mean1, actual), end = '')
print((mean2, mean1, actual), end='')

if do_log_fold:
_list1 = [log(e) for e in list(filter(lambda element: element != 0, list1))]
Expand All @@ -315,11 +346,14 @@ def univariate_pair_compare(list1, list2, expected_fold = None, do_log_fold: boo
if show_pvalue:
if do_log_fold:
result = ttest_ind(_list1, _list2)
print(' p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')
print(
' p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end=''
)
else:
result = ttest_ind(list1, list2)
print(' p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')

print('')


def print_comparison() -> None:
pass
39 changes: 37 additions & 2 deletions analysis_replication/melanoma_il2.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
"""Data analysis script for one dataset."""
"""Data analysis script for one the "Melanoma intralesional IL2" study."""

import sys
from pandas import DataFrame, Series

from scipy.stats import fisher_exact

from accessors import DataAccessor
from accessors import get_default_host
from accessors import univariate_pair_compare as compare
from accessors import handle_expected_actual


def test(host):
study = 'Melanoma intralesional IL2'
Expand Down Expand Up @@ -50,7 +56,36 @@ def test(host):
values3 = df[df['cohort'] == '3']['neighborhood enrichment, B cell and B cell']
compare(values3, values1, expected_fold=80.45, do_log_fold=True)

if __name__=='__main__':
for phenotype, expected_baseline, expected_percentage, expected_p in [
('Adipocyte or Langerhans cell', 3.593e-2, 80, 6e-34),
([{'positive_markers': ['S100B'], 'negative_markers': ['SOX10']}], 6.710e-2, 82, 2.783e-21)
]:
result_df = DataFrame(columns=['odd ratio', 'p-value'])
result_df.index.name = 'specimen'
df = access.counts(phenotype)
df = df[df['cohort'].isin({'1', '3'})]
important_proportion = access.important(phenotype)
if (type(phenotype) is list):
phenotype = access.name_for_all_phenotypes(phenotype)
for specimen, row in df.iterrows():
n_cells_of_this_phenotype = row[phenotype][0]
n_cells_total = row['all cells']
p_important = important_proportion[specimen]
odd_ratio, p_value = fisher_exact([
[n_cells_of_this_phenotype, p_important],
[n_cells_total, 100],
])
result_df.loc[specimen] = [odd_ratio, p_value]
print(f'\nBaseline presence of {phenotype}')
handle_expected_actual(expected_baseline, (df[phenotype].iloc[:, 0]/df['all cells']).mean())
print(f'\nPercentage of top 100 most important cells')
handle_expected_actual(expected_percentage, Series(important_proportion).mean())
print(f'\nLargest p-value from fisher exact test across all specimens')
handle_expected_actual(expected_p, result_df['p-value'].max())
print('')


if __name__ == '__main__':
host: str | None
if len(sys.argv) == 2:
host = sys.argv[1]
Expand Down

0 comments on commit 01ea843

Please sign in to comment.