Skip to content

Commit

Permalink
Merge branch 'main' into issue260
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlinLiao committed Dec 1, 2023
2 parents d49e575 + 01ea843 commit 72a270c
Show file tree
Hide file tree
Showing 9 changed files with 448 additions and 224 deletions.
15 changes: 10 additions & 5 deletions analysis_replication/README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@

# Reproducible analyses

The scripts in this directory reproduce the analyses of the curated datasets. They were written as a record of usage of the dashboard web application, which provides the same results.
The scripts in this directory reproduce the analyses of the curated datasets, in the order they are mentioned in the article.

You can run them on the public demo API,
These scripts were written as a record of usage of the dashboard web application, which provides the same results.

You can run them on the public demo API:

```sh
python reproducible_analyses/melanoma_il2.py oncopathtk.org/api
python run_all.py http://oncopathtk.org/api
```

or from your own local instance of the application,
Or you can run them from your own local instance of the application:

```sh
python reproducible_analyses/melanoma_il2.py "127.0.0.1:8080"
python run_all.py "http://127.0.0.1:8080"
```

replacing the argument with the address of your own local API server. (See *Setting up a local application instance*.)

- These scripts just call the web API, and so they do not require Python package `spatialprofilingtoolbox`.
- You can alternatively store the API host in `api_host.txt` and omit the command-line argument above.
- The run result is here in [results.txt](results.txt).
181 changes: 164 additions & 17 deletions analysis_replication/accessors.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,89 @@
"""Convenience caller of HTTP methods for data access."""
import sys

from typing import cast
import re
from itertools import chain
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import HTTPError
import json
from os.path import exists
from time import sleep

from pandas import DataFrame
from pandas import concat
from numpy import inf
from numpy import nan
from numpy import isnan
from numpy import mean
from numpy import log
from scipy.stats import ttest_ind # type: ignore


def get_default_host(given: str | None) -> str | None:
if given is not None:
return given
filename = 'api_host.txt'
if exists(filename):
with open(filename, 'rt', encoding='utf-8') as file:
host = file.read().rstrip()
else:
host = None
return host


class Colors:
    """ANSI terminal escape sequences used to colorize console output."""
    bold_green = '\u001b[32;1m'
    blue = '\u001b[34m'
    bold_magenta = '\u001b[35;1m'
    bold_red = '\u001b[31;1m'
    yellow = '\u001b[33m'
    # Restores the terminal's default styling; emit after any color above.
    reset = '\u001b[0m'


def sleep_poll(seconds: float = 10) -> None:
    """Announce and perform a delay before re-polling a pending computation.

    Generalized: the delay is now a parameter, defaulting to the original
    fixed 10 seconds, so callers can poll faster or slower without editing
    this function.
    """
    print(f'Waiting {seconds} seconds to poll.')
    sleep(seconds)


class DataAccessor:
"""Convenience caller of HTTP methods for data access."""
def __init__(self, study, host='data.oncopathtk.org'):

def __init__(self, study, host=None):
    """Resolve the API host, record the study, and prefetch summary tables.

    The host is taken from the argument or from api_host.txt (via
    get_default_host); an http:// prefix selects plain-HTTP transport.
    """
    resolved = get_default_host(host)
    if resolved is None:
        raise RuntimeError('Expected host name in api_host.txt .')
    plain_http = re.search('^http://', resolved) is not None
    self.host = re.sub(r'^http://', '', resolved) if plain_http else resolved
    self.study = study
    self.use_http = plain_http
    print('\n' + Colors.bold_magenta + study + Colors.reset + '\n')
    # Prefetch the cohort assignments and total cell counts used by every analysis.
    self.cohorts = self._retrieve_cohorts()
    self.all_cells = self._retrieve_all_cells_counts()

def counts(self, phenotype_names):
    """Return a per-sample DataFrame of counts for the given phenotype(s).

    Columns: cohort assignment, total cell count, the count of cells positive
    for the conjunction of all given phenotypes, and one column per individual
    phenotype. Infinite values are replaced with NaN.

    Note: this reconstructs the clean post-merge version of the method; the
    scraped diff interleaved superseded statements (old `all_name` and
    single-line `concat` calls) with their replacements.
    """
    if isinstance(phenotype_names, str):
        phenotype_names = [phenotype_names]
    conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names)
    all_name = self.name_for_all_phenotypes(phenotype_names)
    conjunction_counts_series = self._get_counts_series(conjunction_criteria, all_name)
    individual_counts_series = [
        self._get_counts_series(self._phenotype_criteria(name), self._name_phenotype(name))
        for name in phenotype_names
    ]
    df = concat(
        [self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series],
        axis=1,
    )
    df.replace([inf, -inf], nan, inplace=True)
    return df

def name_for_all_phenotypes(self, phenotype_names):
    """Join the display names of the given phenotypes with ' and '."""
    names = (self._name_phenotype(phenotype) for phenotype in phenotype_names)
    return ' and '.join(names)

def neighborhood_enrichment(self, phenotype_names):
feature_class = 'neighborhood enrichment'
Expand Down Expand Up @@ -70,12 +115,13 @@ def _one_phenotype_spatial_metric(self, phenotype_names, feature_class):
parts = parts1 + [('study', self.study), ('feature_class', feature_class)]
query = urlencode(parts)
endpoint = 'request-spatial-metrics-computation-custom-phenotype'
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
print('Computation is pending. Try again soon!')
print('URL:')
print(url)
sys.exit()

while True:
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
sleep_poll()
else:
break

rows = [
{'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value}
Expand Down Expand Up @@ -109,12 +155,13 @@ def _two_phenotype_spatial_metric(self, phenotype_names, feature_class):
parts.append(('radius', '100'))
query = urlencode(parts)
endpoint = 'request-spatial-metrics-computation-custom-phenotypes'
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
print('Computation is pending. Try again soon!')
print('URL:')
print(url)
sys.exit()

while True:
response, url = self._retrieve(endpoint, query)
if response['is_pending'] is True:
sleep_poll()
else:
break

rows = [
{'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value}
Expand Down Expand Up @@ -210,3 +257,103 @@ def _name_phenotype(self, phenotype):
for keyword, sign in zip(['positive', 'negative'], ['+', '-'])
]).rstrip()
return str(phenotype)

def important(self, phenotype_names: str | list[str]) -> dict[str, float]:
    """Retrieve per-specimen importance-composition percentages.

    The conjunction criteria of the given phenotype(s) are encoded as
    deduplicated, sorted positive_marker/negative_marker query parameters.
    Returns a mapping from specimen identifier to percentage.
    """
    if isinstance(phenotype_names, str):
        phenotype_names = [phenotype_names]
    criteria = self._conjunction_phenotype_criteria(phenotype_names)
    marker_parts: set[tuple[str, str]] = set()
    for keyword in ('positive', 'negative'):
        for channel in criteria[f'{keyword}_markers']:
            marker_parts.add((f'{keyword}_marker', channel))
    parts = sorted(marker_parts)
    parts.append(('study', self.study))
    query = urlencode(parts)
    phenotype_counts, _ = self._retrieve('cggnn-importance-composition', query)
    return {entry['specimen']: entry['percentage'] for entry in phenotype_counts['counts']}


class ExpectedQuantitativeValueError(ValueError):
"""
Raised when an expected quantitative result is significantly different from the expected value.
"""

def __init__(self, expected: float, actual: float):
error_percent = self.error_percent(expected, actual)
if error_percent is not None:
error_percent = round(100 * error_percent) / 100
message = f'''
Expected {expected} but got {Colors.bold_red}{actual}{Colors.reset}. Error is {error_percent}%.
'''
super().__init__(message)

@staticmethod
def is_error(expected: float, actual: float) -> bool:
error_percent = ExpectedQuantitativeValueError.error_percent(expected, actual)
if error_percent is None:
return True
if error_percent < 1.0:
return False
return True

@staticmethod
def error_percent(expected: float, actual: float) -> float | None:
if actual != 0:
error_percent = abs(100 * (1 - (actual / expected)))
else:
error_percent = None
return error_percent


def handle_expected_actual(expected: float, actual: float | None):
    """Verify actual against expected; raise on >=1% discrepancy.

    On success, print the actual value left-padded to a 21-character field in
    bold green, without a trailing newline.
    """
    value = cast(float, actual)
    if ExpectedQuantitativeValueError.is_error(expected, value):
        raise ExpectedQuantitativeValueError(expected, value)
    print(Colors.bold_green + str(value).ljust(21) + Colors.reset, end='')


def univariate_pair_compare(
list1,
list2,
expected_fold=None,
do_log_fold: bool = False,
show_pvalue=False,
):
list1 = list(filter(lambda element: not isnan(element), list1.values))
list2 = list(filter(lambda element: not isnan(element), list2.values))

mean1 = float(mean(list1))
mean2 = float(mean(list2))
actual = mean2 / mean1
if expected_fold is not None:
handle_expected_actual(expected_fold, actual)
print((mean2, mean1, actual), end='')

if do_log_fold:
_list1 = [log(e) for e in list(filter(lambda element: element != 0, list1))]
_list2 = [log(e) for e in list(filter(lambda element: element != 0, list2))]
_mean1 = float(mean(_list1))
_mean2 = float(mean(_list2))
log_fold = _mean2 / _mean1
print(' log fold: ' + Colors.yellow + str(log_fold) + Colors.reset, end='')

if show_pvalue:
if do_log_fold:
result = ttest_ind(_list1, _list2)
print(
' p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end=''
)
else:
result = ttest_ind(list1, list2)
print(' p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')

print('')

def print_comparison() -> None:
    """Placeholder for a future comparison-reporting routine; does nothing."""
105 changes: 56 additions & 49 deletions analysis_replication/breast_imc.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,64 @@
"""Data analysis script for one dataset."""

import sys

from numpy import mean
from numpy import inf

from accessors import DataAccessor
from accessors import get_default_host
from accessors import univariate_pair_compare as compare

def test(host):
    """Replicate the 'Breast cancer IMC' analyses against the API at `host`.

    Issues a sequence of proximity and count queries through DataAccessor,
    comparing cohorts against expected fold changes recorded from the
    original analysis. Requires network access to a running API server.
    """
    study = 'Breast cancer IMC'
    access = DataAccessor(study, host=host)

    # Keratin phenotypes: each KRT marker in conjunction with pan-cytokeratin (CK).
    KRT = {
        '5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []},
        '7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []},
        '14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []},
        '19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []},
    }

    # Proximity of KRT14+ CK+ to KRT7+ CK+ cells, cohort 2 relative to cohort 1.
    df = access.proximity([KRT['14'], KRT['7']])
    values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+']
    values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+']
    # handle_expected_actual(1.6216, mean2 / mean1)
    # # handle_expected_actual(1.69, mean2 / mean1)
    compare(values1, values2, expected_fold=1.6216, show_pvalue=True)

    # Proximity of KRT14+ CK+ to KRT5+ CK+ cells, cohort 2 relative to cohort 3.
    df = access.proximity([KRT['14'], KRT['5']])
    values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+']
    values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+']
    # # handle_expected_actual(0.65, mean2 / mean3)
    # handle_expected_actual(1.265, mean2 / mean3)
    compare(values3, values2, expected_fold=1.265)

    # Population fraction of the KRT14/KRT7 conjunction phenotype over all cells.
    df = access.counts([KRT['14'], KRT['7']])
    fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells']
    mean0 = float(mean(fractions))
    print(mean0)

    # Population fraction of the KRT14/KRT5 conjunction phenotype over all cells.
    df = access.counts([KRT['14'], KRT['5']])
    fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells']
    mean0 = float(mean(fractions))
    print(mean0)

    # Per-sample ratio of KRT14+ CK+ to KRT19+ CK+ counts, compared across cohorts.
    df = access.counts([KRT['14'], KRT['19']])
    fractions = df['KRT14+ CK+'] / df['KRT19+ CK+']
    # Drop samples where the denominator count is zero (ratio is infinite).
    fractions = fractions[fractions != inf]
    fractions1 = fractions[df['cohort'] == '1']
    fractions2 = fractions[df['cohort'] == '2']
    fractions3 = fractions[df['cohort'] == '3']
    compare(fractions3, fractions2, expected_fold=111.32, show_pvalue=True)
    compare(fractions1, fractions2, expected_fold=11.39, show_pvalue=True)


study = 'Breast cancer IMC'
if len(sys.argv) == 1:
access = DataAccessor(study)
else:
access = DataAccessor(study, host=sys.argv[1])


KRT = {
'5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []},
'7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []},
'14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []},
'19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []},
}

df = access.proximity([KRT['14'], KRT['7']])
values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+']
values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+']
mean1 = mean(values1)
mean2 = mean(values2)
print((mean2, mean1, mean2 / mean1))

df = access.proximity([KRT['14'], KRT['5']])
values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+']
values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+']
mean2 = mean(values2)
mean3 = mean(values3)
print((mean2, mean3, mean2 / mean3))

df = access.counts([KRT['14'], KRT['7']])
fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells']
print(mean(fractions))


df = access.counts([KRT['14'], KRT['5']])
fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells']
print(mean(fractions))

df = access.counts([KRT['14'], KRT['19']])
fractions = df['KRT14+ CK+'] / df['KRT19+ CK+']
fractions = fractions[fractions != inf]
fractions1 = fractions[df['cohort'] == '1']
fractions2 = fractions[df['cohort'] == '2']
fractions3 = fractions[df['cohort'] == '3']
mean1 = mean(fractions1)
mean2 = mean(fractions2)
mean3 = mean(fractions3)
print((mean2, mean3, mean2 / mean3))
print((mean2, mean1, mean2 / mean1))
if __name__ == '__main__':
    # Determine the API host: command-line argument first, then api_host.txt.
    host: str | None
    if len(sys.argv) == 2:
        host = sys.argv[1]
    else:
        host = get_default_host(None)
    if host is None:
        raise RuntimeError('Could not determine API server.')
    # Bug fix: previously called test(get_default_host(None)), discarding a
    # command-line-supplied host and bypassing the None check above.
    test(host)
Loading

0 comments on commit 72a270c

Please sign in to comment.