Skip to content

Commit

Permalink
Merge branch 'main' into issue260
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlinLiao committed Dec 1, 2023
2 parents d49e575 + 01ea843 commit 72a270c
Show file tree
Hide file tree
Showing 9 changed files with 448 additions and 224 deletions.
15 changes: 10 additions & 5 deletions analysis_replication/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@

# Reproducible analyses

The scripts in this directory reproduce the analyses of the curated datasets. They were written as a record of usage of the dashboard web application, which provides the same results.
The scripts in this directory reproduce the analyses of the curated datasets, in the order they are mentioned in the article.

You can run them on the public demo API,
These scripts were written as a record of usage of the dashboard web application, which provides the same results.

You can run them on the public demo API:

```sh
python reproducible_analyses/melanoma_il2.py oncopathtk.org/api
python run_all.py http://oncopathtk.org/api
```

or from your own local instance of the application,
Or you can run them from your own local instance of the application:

```sh
python reproducible_analyses/melanoma_il2.py "127.0.0.1:8080"
python run_all.py "http://127.0.0.1:8080"
```

replacing the argument with the address of your own local API server. (See *Setting up a local application instance*.)

- These scripts just call the web API, and so they do not require Python package `spatialprofilingtoolbox`.
- You can alternatively store the API host in `api_host.txt` and omit the command-line argument above.
- The run result is here in [results.txt](results.txt).
181 changes: 164 additions & 17 deletions analysis_replication/accessors.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,89 @@
"""Convenience caller of HTTP methods for data access."""
import sys

from typing import cast
import re
from itertools import chain
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import HTTPError
import json
from os.path import exists
from time import sleep

from pandas import DataFrame
from pandas import concat
from numpy import inf
from numpy import nan
from numpy import isnan
from numpy import mean
from numpy import log
from scipy.stats import ttest_ind # type: ignore


def get_default_host(given: str | None) -> str | None:
if given is not None:
return given
filename = 'api_host.txt'
if exists(filename):
with open(filename, 'rt', encoding='utf-8') as file:
host = file.read().rstrip()
else:
host = None
return host


class Colors:
    """ANSI terminal escape sequences used to colorize console output."""
    bold_green = '\u001b[32;1m'
    blue = '\u001b[34m'
    bold_magenta = '\u001b[35;1m'
    bold_red = '\u001b[31;1m'
    yellow = '\u001b[33m'
    # Restores the terminal's default styling; emit after any color above.
    reset = '\u001b[0m'


def sleep_poll(seconds: float = 10) -> None:
    """Announce and perform a delay before re-polling a pending computation.

    Generalized: the delay is now a parameter, defaulting to the original
    fixed 10 seconds, so callers can poll faster or slower without editing
    this function.
    """
    print(f'Waiting {seconds} seconds to poll.')
    sleep(seconds)


class DataAccessor:
"""Convenience caller of HTTP methods for data access."""
def __init__(self, study, host='data.oncopathtk.org'):

def __init__(self, study, host=None):
    """Resolve the API host, record the study, and prefetch summary tables.

    The host is taken from the argument or from api_host.txt (via
    get_default_host); an http:// prefix selects plain-HTTP transport.
    """
    resolved = get_default_host(host)
    if resolved is None:
        raise RuntimeError('Expected host name in api_host.txt .')
    plain_http = re.search('^http://', resolved) is not None
    self.host = re.sub(r'^http://', '', resolved) if plain_http else resolved
    self.study = study
    self.use_http = plain_http
    print('\n' + Colors.bold_magenta + study + Colors.reset + '\n')
    # Prefetch the cohort assignments and total cell counts used by every analysis.
    self.cohorts = self._retrieve_cohorts()
    self.all_cells = self._retrieve_all_cells_counts()

def counts(self, phenotype_names):
    """Return a per-sample DataFrame of counts for the given phenotype(s).

    Columns: cohort assignment, total cell count, the count of cells positive
    for the conjunction of all given phenotypes, and one column per individual
    phenotype. Infinite values are replaced with NaN.

    Note: this reconstructs the clean post-merge version of the method; the
    scraped diff interleaved superseded statements (old `all_name` and
    single-line `concat` calls) with their replacements.
    """
    if isinstance(phenotype_names, str):
        phenotype_names = [phenotype_names]
    conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
    all_name = self.name_for_all_phenotypes(phenotype_names)
    conjunction_counts_series = self._get_counts_series(conjunction_criteria, all_name)
    individual_counts_series = [
        self._get_counts_series(self._phenotype_criteria(name), self._name_phenotype(name))
        for name in phenotype_names
    ]
    df = concat(
        [self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series],
        axis=1,
    )
    df.replace([inf, -inf], nan, inplace=True)
    return df

def name_for_all_phenotypes(self, phenotype_names):
    """Join the display names of the given phenotypes with ' and '."""
    names = (self._name_phenotype(phenotype) for phenotype in phenotype_names)
    return ' and '.join(names)

def neighborhood_enrichment(self, phenotype_names):
feature_class = 'neighborhood enrichment'
Expand Down Expand Up @@ -70,12 +115,13 @@ def _one_phenotype_spatial_metric(self, phenotype_names, feature_class):
parts = parts1 + [('study', self.study), ('feature_class', feature_class)]
query = urlencode(parts)
endpoint = 'request-spatial-metrics-computation-custom-phenotype'
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
print('Computation is pending. Try again soon!')
print('URL:')
print(url)
sys.exit()

while True:
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
sleep_poll()
else:
break

rows = [
{'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value}
Expand Down Expand Up @@ -109,12 +155,13 @@ def _two_phenotype_spatial_metric(self, phenotype_names, feature_class):
parts.append(('radius', '100'))
query = urlencode(parts)
endpoint = 'request-spatial-metrics-computation-custom-phenotypes'
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
print('Computation is pending. Try again soon!')
print('URL:')
print(url)
sys.exit()

while True:
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
sleep_poll()
else:
break

rows = [
{'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value}
Expand Down Expand Up @@ -210,3 +257,103 @@ def _name_phenotype(self, phenotype):
for keyword, sign in zip(['positive', 'negative'], ['+', '-'])
]).rstrip()
return str(phenotype)

def important(self, phenotype_names: str | list[str]) -> dict[str, float]:
    """Retrieve per-specimen importance-composition percentages.

    The conjunction criteria of the given phenotype(s) are encoded as
    deduplicated, sorted positive_marker/negative_marker query parameters.
    Returns a mapping from specimen identifier to percentage.
    """
    if isinstance(phenotype_names, str):
        phenotype_names = [phenotype_names]
    criteria = self._conjunction_phenotype_criteria(phenotype_names)
    marker_parts: set[tuple[str, str]] = set()
    for keyword in ('positive', 'negative'):
        for channel in criteria[f'{keyword}_markers']:
            marker_parts.add((f'{keyword}_marker', channel))
    parts = sorted(marker_parts)
    parts.append(('study', self.study))
    query = urlencode(parts)
    phenotype_counts, _ = self._retrieve('cggnn-importance-composition', query)
    return {entry['specimen']: entry['percentage'] for entry in phenotype_counts['counts']}


class ExpectedQuantitativeValueError(ValueError):
"""
Raised when an expected quantitative result is significantly different from the expected value.
"""

def __init__(self, expected: float, actual: float):
error_percent = self.error_percent(expected, actual)
if error_percent is not None:
error_percent = round(100 * error_percent) / 100
message = f'''
Expected {expected} but got {Colors.bold_red}{actual}{Colors.reset}. Error is {error_percent}%.
'''
super().__init__(message)

@staticmethod
def is_error(expected: float, actual: float) -> bool:
error_percent = ExpectedQuantitativeValueError.error_percent(expected, actual)
if error_percent is None:
return True
if error_percent < 1.0:
return False
return True

@staticmethod
def error_percent(expected: float, actual: float) -> float | None:
if actual != 0:
error_percent = abs(100 * (1 - (actual / expected)))
else:
error_percent = None
return error_percent


def handle_expected_actual(expected: float, actual: float | None):
    """Verify actual against expected; raise on >=1% discrepancy.

    On success, print the actual value left-padded to a 21-character field in
    bold green, without a trailing newline.
    """
    value = cast(float, actual)
    if ExpectedQuantitativeValueError.is_error(expected, value):
        raise ExpectedQuantitativeValueError(expected, value)
    print(Colors.bold_green + str(value).ljust(21) + Colors.reset, end='')


def univariate_pair_compare(
list1,
list2,
expected_fold=None,
do_log_fold: bool = False,
show_pvalue=False,
):
list1 = list(filter(lambda element: not isnan(element), list1.values))
list2 = list(filter(lambda element: not isnan(element), list2.values))

mean1 = float(mean(list1))
mean2 = float(mean(list2))
actual = mean2 / mean1
if expected_fold is not None:
handle_expected_actual(expected_fold, actual)
print((mean2, mean1, actual), end='')

if do_log_fold:
_list1 = [log(e) for e in list(filter(lambda element: element != 0, list1))]
_list2 = [log(e) for e in list(filter(lambda element: element != 0, list2))]
_mean1 = float(mean(_list1))
_mean2 = float(mean(_list2))
log_fold = _mean2 / _mean1
print(' log fold: ' + Colors.yellow + str(log_fold) + Colors.reset, end='')

if show_pvalue:
if do_log_fold:
result = ttest_ind(_list1, _list2)
print(
' p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end=''
)
else:
result = ttest_ind(list1, list2)
print(' p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')

print('')

def print_comparison() -> None:
    """Placeholder for a future comparison-reporting routine; does nothing."""
105 changes: 56 additions & 49 deletions analysis_replication/breast_imc.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,64 @@
"""Data analysis script for one dataset."""

import sys

from numpy import mean
from numpy import inf

from accessors import DataAccessor
from accessors import get_default_host
from accessors import univariate_pair_compare as compare

def test(host):
    """Replicate the 'Breast cancer IMC' analyses against the API at `host`.

    Issues a sequence of proximity and count queries through DataAccessor,
    comparing cohorts against expected fold changes recorded from the
    original analysis. Requires network access to a running API server.
    """
    study = 'Breast cancer IMC'
    access = DataAccessor(study, host=host)

    # Keratin phenotypes: each KRT marker in conjunction with pan-cytokeratin (CK).
    KRT = {
        '5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []},
        '7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []},
        '14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []},
        '19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []},
    }

    # Proximity of KRT14+ CK+ to KRT7+ CK+ cells, cohort 2 relative to cohort 1.
    df = access.proximity([KRT['14'], KRT['7']])
    values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+']
    values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+']
    # handle_expected_actual(1.6216, mean2 / mean1)
    # # handle_expected_actual(1.69, mean2 / mean1)
    compare(values1, values2, expected_fold=1.6216, show_pvalue=True)

    # Proximity of KRT14+ CK+ to KRT5+ CK+ cells, cohort 2 relative to cohort 3.
    df = access.proximity([KRT['14'], KRT['5']])
    values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+']
    values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+']
    # # handle_expected_actual(0.65, mean2 / mean3)
    # handle_expected_actual(1.265, mean2 / mean3)
    compare(values3, values2, expected_fold=1.265)

    # Population fraction of the KRT14/KRT7 conjunction phenotype over all cells.
    df = access.counts([KRT['14'], KRT['7']])
    fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells']
    mean0 = float(mean(fractions))
    print(mean0)

    # Population fraction of the KRT14/KRT5 conjunction phenotype over all cells.
    df = access.counts([KRT['14'], KRT['5']])
    fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells']
    mean0 = float(mean(fractions))
    print(mean0)

    # Per-sample ratio of KRT14+ CK+ to KRT19+ CK+ counts, compared across cohorts.
    df = access.counts([KRT['14'], KRT['19']])
    fractions = df['KRT14+ CK+'] / df['KRT19+ CK+']
    # Drop samples where the denominator count is zero (ratio is infinite).
    fractions = fractions[fractions != inf]
    fractions1 = fractions[df['cohort'] == '1']
    fractions2 = fractions[df['cohort'] == '2']
    fractions3 = fractions[df['cohort'] == '3']
    compare(fractions3, fractions2, expected_fold=111.32, show_pvalue=True)
    compare(fractions1, fractions2, expected_fold=11.39, show_pvalue=True)


study = 'Breast cancer IMC'
if len(sys.argv) == 1:
access = DataAccessor(study)
else:
access = DataAccessor(study, host=sys.argv[1])


KRT = {
'5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []},
'7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []},
'14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []},
'19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []},
}

df = access.proximity([KRT['14'], KRT['7']])
values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+']
values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+']
mean1 = mean(values1)
mean2 = mean(values2)
print((mean2, mean1, mean2 / mean1))

df = access.proximity([KRT['14'], KRT['5']])
values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+']
values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+']
mean2 = mean(values2)
mean3 = mean(values3)
print((mean2, mean3, mean2 / mean3))

df = access.counts([KRT['14'], KRT['7']])
fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells']
print(mean(fractions))


df = access.counts([KRT['14'], KRT['5']])
fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells']
print(mean(fractions))

df = access.counts([KRT['14'], KRT['19']])
fractions = df['KRT14+ CK+'] / df['KRT19+ CK+']
fractions = fractions[fractions != inf]
fractions1 = fractions[df['cohort'] == '1']
fractions2 = fractions[df['cohort'] == '2']
fractions3 = fractions[df['cohort'] == '3']
mean1 = mean(fractions1)
mean2 = mean(fractions2)
mean3 = mean(fractions3)
print((mean2, mean3, mean2 / mean3))
print((mean2, mean1, mean2 / mean1))
if __name__ == '__main__':
    # Determine the API host: command-line argument first, then api_host.txt.
    host: str | None
    if len(sys.argv) == 2:
        host = sys.argv[1]
    else:
        host = get_default_host(None)
    if host is None:
        raise RuntimeError('Could not determine API server.')
    # Bug fix: previously called test(get_default_host(None)), discarding a
    # command-line-supplied host and bypassing the None check above.
    test(host)
Loading

0 comments on commit 72a270c

Please sign in to comment.