diff --git a/analysis_replication/README.md b/analysis_replication/README.md index 487296a1c..d7123b46e 100644 --- a/analysis_replication/README.md +++ b/analysis_replication/README.md @@ -1,19 +1,24 @@ # Reproducible analyses -The scripts in this directory reproduce the analyses of the curated datasets. They were written as a record of usage of the dashboard web application, which provides the same results. +The scripts in this directory reproduce the analyses of the curated datasets, in the order they are mentioned in the article. -You can run them on the public demo API, +These scripts were written as a record of usage of the dashboard web application, which provides the same results. + +You can run them on the public demo API: ```sh -python reproducible_analyses/melanoma_il2.py oncopathtk.org/api +python run_all.py http://oncopathtk.org/api ``` -or from your own local instance of the application, +Or you can run them from your own local instance of the application: ```sh -python reproducible_analyses/melanoma_il2.py "127.0.0.1:8080" +python run_all.py "http://127.0.0.1:8080" ``` substituting the argument with the address of your local API server. (See *Setting up a local application instance*). +- These scripts just call the web API, and so they do not require Python package `spatialprofilingtoolbox`. +- You can alternatively store the API host in `api_host.txt` and omit the command-line argument above. +- The run result is here in [results.txt](results.txt). diff --git a/analysis_replication/accessors.py b/analysis_replication/accessors.py index db1141d78..ad0250894 100644 --- a/analysis_replication/accessors.py +++ b/analysis_replication/accessors.py @@ -1,21 +1,59 @@ """Convenience caller of HTTP methods for data access.""" -import sys - +from typing import cast import re from itertools import chain from urllib.request import urlopen from urllib.parse import urlencode from urllib.error import HTTPError import json +from os.path import exists +from time import sleep from pandas import DataFrame from pandas import concat from numpy import inf from numpy import nan +from numpy import isnan +from numpy import mean +from numpy import log +from scipy.stats import ttest_ind # type: ignore + + +def get_default_host(given: str | None) -> str | None: + if given is not None: + return given + filename = 'api_host.txt' + if exists(filename): + with open(filename, 'rt', encoding='utf-8') as file: + host = file.read().rstrip() + else: + host = None + return host + + +class Colors: + bold_green = '\u001b[32;1m' + blue = '\u001b[34m' + bold_magenta = '\u001b[35;1m' + bold_red = '\u001b[31;1m' + yellow = '\u001b[33m' + reset = '\u001b[0m' + + +def sleep_poll(): + seconds = 10 + print(f'Waiting {seconds} seconds to poll.') + sleep(seconds) + class DataAccessor: """Convenience caller of HTTP methods for data access.""" - def __init__(self, study, host='data.oncopathtk.org'): + + def __init__(self, study, host=None): + _host = get_default_host(host) + if _host is None: + raise RuntimeError('Expected host name in api_host.txt .') + host = _host use_http = False if re.search('^http://', host): use_http = True @@ -23,6 +61,7 @@ def __init__(self, study, host='data.oncopathtk.org'): self.host = host self.study = study self.use_http = use_http + print('\n' + Colors.bold_magenta + study + Colors.reset + '\n') self.cohorts = self._retrieve_cohorts() self.all_cells = self._retrieve_all_cells_counts() @@ -30,15 +69,21 @@ def counts(self, phenotype_names): if isinstance(phenotype_names, str): phenotype_names = [phenotype_names] conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names) - all_name = ' and '.join([self._name_phenotype(p) for p in phenotype_names]) + all_name = self.name_for_all_phenotypes(phenotype_names) conjunction_counts_series = self._get_counts_series(conjunction_criteria, all_name) individual_counts_series = [ self._get_counts_series(self._phenotype_criteria(name), self._name_phenotype(name)) for name in phenotype_names ] - df = concat([self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series], axis=1) + df = concat( + [self.cohorts, self.all_cells, conjunction_counts_series, *individual_counts_series], + axis=1, + ) df.replace([inf, -inf], nan, inplace=True) return df + + def name_for_all_phenotypes(self, phenotype_names): + return ' and '.join([self._name_phenotype(p) for p in phenotype_names]) def neighborhood_enrichment(self, phenotype_names): feature_class = 'neighborhood enrichment' @@ -70,12 +115,13 @@ def _one_phenotype_spatial_metric(self, phenotype_names, feature_class): parts = parts1 + [('study', self.study), ('feature_class', feature_class)] query = urlencode(parts) endpoint = 'request-spatial-metrics-computation-custom-phenotype' - response, url = self._retrieve(endpoint, query) - if response['is_pending'] is True: - print('Computation is pending. Try again soon!') - print('URL:') - print(url) - sys.exit() + + while True: + response, url = self._retrieve(endpoint, query) + if response['is_pending'] is True: + sleep_poll() + else: + break rows = [ {'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value} @@ -109,12 +155,13 @@ def _two_phenotype_spatial_metric(self, phenotype_names, feature_class): parts.append(('radius', '100')) query = urlencode(parts) endpoint = 'request-spatial-metrics-computation-custom-phenotypes' - response, url = self._retrieve(endpoint, query) - if response['is_pending'] is True: - print('Computation is pending. Try again soon!') - print('URL:') - print(url) - sys.exit() + + while True: + response, url = self._retrieve(endpoint, query) + if response['is_pending'] is True: + sleep_poll() + else: + break rows = [ {'sample': key, '%s, %s' % (feature_class, ' and '.join(names)): value} @@ -210,3 +257,103 @@ def _name_phenotype(self, phenotype): for keyword, sign in zip(['positive', 'negative'], ['+', '-']) ]).rstrip() return str(phenotype) + + def important(self, phenotype_names: str | list[str]) -> dict[str, float]: + if isinstance(phenotype_names, str): + phenotype_names = [phenotype_names] + conjunction_criteria = self._conjunction_phenotype_criteria(phenotype_names) + parts = list(chain(*[ + [(f'{keyword}_marker', channel) for channel in argument] + for keyword, argument in zip( + ['positive', 'negative'], [ + conjunction_criteria['positive_markers'], + conjunction_criteria['negative_markers'], + ]) + ])) + parts = sorted(list(set(parts))) + parts.append(('study', self.study)) + query = urlencode(parts) + phenotype_counts, _ = self._retrieve('cggnn-importance-composition', query) + return {c['specimen']: c['percentage'] for c in phenotype_counts['counts']} + + +class ExpectedQuantitativeValueError(ValueError): + """ + Raised when an expected quantitative result is significantly different from the expected value. + """ + + def __init__(self, expected: float, actual: float): + error_percent = self.error_percent(expected, actual) + if error_percent is not None: + error_percent = round(100 * error_percent) / 100 + message = f''' + Expected {expected} but got {Colors.bold_red}{actual}{Colors.reset}. Error is {error_percent}%. + ''' + super().__init__(message) + + @staticmethod + def is_error(expected: float, actual: float) -> bool: + error_percent = ExpectedQuantitativeValueError.error_percent(expected, actual) + if error_percent is None: + return True + if error_percent < 1.0: + return False + return True + + @staticmethod + def error_percent(expected: float, actual: float) -> float | None: + if actual != 0: + error_percent = abs(100 * (1 - (actual / expected))) + else: + error_percent = None + return error_percent + + +def handle_expected_actual(expected: float, actual: float | None): + _actual = cast(float, actual) + if ExpectedQuantitativeValueError.is_error(expected, _actual): + raise ExpectedQuantitativeValueError(expected, _actual) + string = str(_actual) + padded = string + ' '*(21 - len(string)) + print(Colors.bold_green + padded + Colors.reset, end='') + + +def univariate_pair_compare( + list1, + list2, + expected_fold=None, + do_log_fold: bool = False, + show_pvalue=False, +): + list1 = list(filter(lambda element: not isnan(element), list1.values)) + list2 = list(filter(lambda element: not isnan(element), list2.values)) + + mean1 = float(mean(list1)) + mean2 = float(mean(list2)) + actual = mean2 / mean1 + if expected_fold is not None: + handle_expected_actual(expected_fold, actual) + print((mean2, mean1, actual), end='') + + if do_log_fold: + _list1 = [log(e) for e in list(filter(lambda element: element != 0, list1))] + _list2 = [log(e) for e in list(filter(lambda element: element != 0, list2))] + _mean1 = float(mean(_list1)) + _mean2 = float(mean(_list2)) + log_fold = _mean2 / _mean1 + print(' log fold: ' + Colors.yellow + str(log_fold) + Colors.reset, end='') + + if show_pvalue: + if do_log_fold: + result = ttest_ind(_list1, _list2) + print( + ' p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end='' + ) + else: + result = ttest_ind(list1, list2) + print(' p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='') + + print('') + +def print_comparison() -> None: + pass diff --git a/analysis_replication/breast_imc.py b/analysis_replication/breast_imc.py index fc0d563ac..32aefe1bb 100644 --- a/analysis_replication/breast_imc.py +++ b/analysis_replication/breast_imc.py @@ -1,57 +1,64 @@ """Data analysis script for one dataset.""" - import sys from numpy import mean from numpy import inf from accessors import DataAccessor +from accessors import get_default_host +from accessors import univariate_pair_compare as compare + +def test(host): + study = 'Breast cancer IMC' + access = DataAccessor(study, host=host) + + KRT = { + '5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []}, + '7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []}, + '14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []}, + '19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []}, + } + + df = access.proximity([KRT['14'], KRT['7']]) + values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+'] + values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+'] + # handle_expected_actual(1.6216, mean2 / mean1) + # # handle_expected_actual(1.69, mean2 / mean1) + compare(values1, values2, expected_fold=1.6216, show_pvalue=True) + + df = access.proximity([KRT['14'], KRT['5']]) + values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+'] + values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+'] + # # handle_expected_actual(0.65, mean2 / mean3) + # handle_expected_actual(1.265, mean2 / mean3) + compare(values3, values2, expected_fold=1.265) + + df = access.counts([KRT['14'], KRT['7']]) + fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells'] + mean0 = float(mean(fractions)) + print(mean0) + + df = access.counts([KRT['14'], KRT['5']]) + fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells'] + mean0 = float(mean(fractions)) + print(mean0) + + df = access.counts([KRT['14'], KRT['19']]) + fractions = df['KRT14+ CK+'] / df['KRT19+ CK+'] + fractions = fractions[fractions != inf] + fractions1 = fractions[df['cohort'] == '1'] + fractions2 = fractions[df['cohort'] == '2'] + fractions3 = fractions[df['cohort'] == '3'] + compare(fractions3, fractions2, expected_fold=111.32, show_pvalue=True) + compare(fractions1, fractions2, expected_fold=11.39, show_pvalue=True) + -study = 'Breast cancer IMC' -if len(sys.argv) == 1: - access = DataAccessor(study) -else: - access = DataAccessor(study, host=sys.argv[1]) - - -KRT = { - '5': {'positive_markers': ['KRT5', 'CK'], 'negative_markers': []}, - '7': {'positive_markers': ['KRT7', 'CK'], 'negative_markers': []}, - '14': {'positive_markers': ['KRT14', 'CK'], 'negative_markers': []}, - '19': {'positive_markers': ['KRT19', 'CK'], 'negative_markers': []}, -} - -df = access.proximity([KRT['14'], KRT['7']]) -values1 = df[df['cohort'] == '1']['proximity, KRT14+ CK+ and KRT7+ CK+'] -values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean2, mean1, mean2 / mean1)) - -df = access.proximity([KRT['14'], KRT['5']]) -values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+'] -values3 = df[df['cohort'] == '3']['proximity, KRT14+ CK+ and KRT5+ CK+'] -mean2 = mean(values2) -mean3 = mean(values3) -print((mean2, mean3, mean2 / mean3)) - -df = access.counts([KRT['14'], KRT['7']]) -fractions = df['KRT14+ CK+ and KRT7+ CK+'] / df['all cells'] -print(mean(fractions)) - - -df = access.counts([KRT['14'], KRT['5']]) -fractions = df['KRT14+ CK+ and KRT5+ CK+'] / df['all cells'] -print(mean(fractions)) - -df = access.counts([KRT['14'], KRT['19']]) -fractions = df['KRT14+ CK+'] / df['KRT19+ CK+'] -fractions = fractions[fractions != inf] -fractions1 = fractions[df['cohort'] == '1'] -fractions2 = fractions[df['cohort'] == '2'] -fractions3 = fractions[df['cohort'] == '3'] -mean1 = mean(fractions1) -mean2 = mean(fractions2) -mean3 = mean(fractions3) -print((mean2, mean3, mean2 / mean3)) -print((mean2, mean1, mean2 / mean1)) +if __name__=='__main__': + host: str | None + if len(sys.argv) == 2: + host = sys.argv[1] + else: + host = get_default_host(None) + if host is None: + raise RuntimeError('Could not determine API server.') + test(get_default_host(None)) diff --git a/analysis_replication/luad_imc.py b/analysis_replication/luad_imc.py index d14cf35f3..b4afb558b 100644 --- a/analysis_replication/luad_imc.py +++ b/analysis_replication/luad_imc.py @@ -1,57 +1,54 @@ """Data analysis script for one dataset.""" - import sys -from numpy import mean from numpy import inf from accessors import DataAccessor - -study = 'LUAD progression' -if len(sys.argv) == 1: - access = DataAccessor(study) -else: - access = DataAccessor(study, host=sys.argv[1]) - - -df = access.spatial_autocorrelation('B cell') -values1 = df[df['cohort'] == '1']['spatial autocorrelation, B cell'] -values2 = df[df['cohort'] == '2']['spatial autocorrelation, B cell'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean2, mean1, mean2 / mean1)) - -df = access.neighborhood_enrichment(['CD163+ macrophage', 'Regulatory T cell']) -values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell'] -values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean2, mean1, mean2 / mean1)) - -df = access.neighborhood_enrichment(['CD163+ macrophage', 'Endothelial cell']) -values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Endothelial cell'] -values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Endothelial cell'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean2, mean1, mean2 / mean1)) - -klrd1 = {'positive_markers': ['KLRD1'], 'negative_markers': []} -cd14 = {'positive_markers': ['CD14'], 'negative_markers': []} -cd14_fcgr3a = {'positive_markers': ['CD14', 'FCGR3A'], 'negative_markers': []} - -df = access.counts([klrd1, cd14, cd14_fcgr3a]) -fractions = df['KLRD1+'] / df['CD14+ FCGR3A+'] -fractions = fractions[fractions != inf] -fractions1 = fractions[df['cohort'] == '1'] -fractions2 = fractions[df['cohort'] == '2'] -mean1 = mean(fractions1) -mean2 = mean(fractions2) -print((mean2, mean1, mean2 / mean1)) - -fractions = df['KLRD1+'] / df['CD14+'] -fractions = fractions[fractions != inf] -fractions1 = fractions[df['cohort'] == '1'] -fractions2 = fractions[df['cohort'] == '2'] -mean1 = mean(fractions1) -mean2 = mean(fractions2) -print((mean2, mean1, mean2 / mean1)) +from accessors import get_default_host +from accessors import univariate_pair_compare as compare + +def test(host): + study = 'LUAD progression' + access = DataAccessor(study, host=host) + + df = access.spatial_autocorrelation('B cell') + values1 = df[df['cohort'] == '1']['spatial autocorrelation, B cell'] + values2 = df[df['cohort'] == '2']['spatial autocorrelation, B cell'] + compare(values1, values2, expected_fold=1.571, show_pvalue=True) + + df = access.neighborhood_enrichment(['CD163+ macrophage', 'Regulatory T cell']) + values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell'] + values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell'] + compare(values1, values2, expected_fold=797.46, do_log_fold=True, show_pvalue=True) + + df = access.neighborhood_enrichment(['CD163+ macrophage', 'Endothelial cell']) + values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Endothelial cell'] + values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Endothelial cell'] + compare(values1, values2, expected_fold=9.858, do_log_fold=True, show_pvalue=True) + + klrd1 = {'positive_markers': ['KLRD1'], 'negative_markers': []} + cd14 = {'positive_markers': ['CD14'], 'negative_markers': []} + cd14_fcgr3a = {'positive_markers': ['CD14', 'FCGR3A'], 'negative_markers': []} + + df = access.counts([klrd1, cd14, cd14_fcgr3a]) + fractions = df['KLRD1+'] / df['CD14+ FCGR3A+'] + fractions = fractions[fractions != inf] + fractions1 = fractions[df['cohort'] == '1'] + fractions2 = fractions[df['cohort'] == '2'] + compare(fractions1, fractions2, expected_fold=4.56, show_pvalue=True) + + fractions = df['KLRD1+'] / df['CD14+'] + fractions = fractions[fractions != inf] + fractions1 = fractions[df['cohort'] == '1'] + fractions2 = fractions[df['cohort'] == '2'] + compare(fractions1, fractions2, expected_fold=3.78, show_pvalue=True) + +if __name__=='__main__': + host: str | None + if len(sys.argv) == 2: + host = sys.argv[1] + else: + host = get_default_host(None) + if host is None: + raise RuntimeError('Could not determine API server.') + test(host) diff --git a/analysis_replication/melanoma_ici.py b/analysis_replication/melanoma_ici.py index 7a181de3b..7507cadb9 100644 --- a/analysis_replication/melanoma_ici.py +++ b/analysis_replication/melanoma_ici.py @@ -1,51 +1,43 @@ """Data analysis script for one dataset.""" - import sys from numpy import mean from accessors import DataAccessor - -study = 'Melanoma CyTOF ICI' -if len(sys.argv) == 1: - access = DataAccessor(study) -else: - access = DataAccessor(study, host=sys.argv[1]) - -antigen_experienced_cytotoxic = {'positive_markers': ['CD8A', 'CD3', 'CD45RO'], 'negative_markers': []} - -# The average value of the neighborhood enrichment score for phenotype(s) CD3+ CD45RO+ CD8A+ and -# Melanoma is 1.39 times higher in cohort 1 than in cohort 2. -df = access.neighborhood_enrichment([antigen_experienced_cytotoxic, 'Melanoma']) -print(df) -values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma'] -values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean1, mean2, mean1 / mean2)) - -print('') - -# The average value of the co-occurrence score for phenotype(s) CD3+ CD45RO+ CD8A+ and Melanoma -# is 1.13 times higher in cohort 1 than in cohort 2. -df = access.co_occurrence([antigen_experienced_cytotoxic, 'Melanoma']) -print(df) -values1 = df[df['cohort'] == '1']['co-occurrence, CD8A+ CD3+ CD45RO+ and Melanoma'] -values2 = df[df['cohort'] == '2']['co-occurrence, CD8A+ CD3+ CD45RO+ and Melanoma'] -mean1 = mean(values1) -mean2 = mean(values2) -print((mean1, mean2, mean1 / mean2)) - -print('') - -# On average, the fraction of cells that are CD8A+ CD3+ CD45RO+ and MKI67+ is 1.41 times higher -# in cohort 1 than in cohort 2. -proliferative = {'positive_markers': ['MKI67'], 'negative_markers': []} -df = access.counts([antigen_experienced_cytotoxic, proliferative]) -print(df) -fractions = df['CD8A+ CD3+ CD45RO+ and MKI67+'] / df['all cells'] -fractions1 = fractions[df['cohort'] == '1'] -fractions2 = fractions[df['cohort'] == '2'] -mean1 = mean(fractions1) -mean2 = mean(fractions2) -print((mean1, mean2, mean1 / mean2)) +from accessors import get_default_host +from accessors import univariate_pair_compare as compare + +def test(host): + study = 'Melanoma CyTOF ICI' + access = DataAccessor(study, host=host) + + antigen_experienced_cytotoxic = {'positive_markers': ['CD8A', 'CD3', 'CD45RO'], 'negative_markers': []} + + # The average value of the neighborhood enrichment score for phenotype(s) CD3+ CD45RO+ CD8A+ and + # Melanoma is 1.39 times higher in cohort 1 than in cohort 2. + df = access.neighborhood_enrichment([antigen_experienced_cytotoxic, 'Melanoma']) + values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma'] + values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma'] + # handle_expected_actual(1.234, mean1 / mean2) + # # handle_expected_actual(1.39, mean1 / mean2) + compare(values2, values1, expected_fold=1.234, do_log_fold=True) + + # On average, the fraction of cells that are CD8A+ CD3+ CD45RO+ and MKI67+ is 1.41 times higher + # in cohort 1 than in cohort 2. + proliferative = {'positive_markers': ['MKI67'], 'negative_markers': []} + df = access.counts([antigen_experienced_cytotoxic, proliferative]) + fractions = df['CD8A+ CD3+ CD45RO+ and MKI67+'] / df['all cells'] + fractions1 = fractions[df['cohort'] == '1'] + fractions2 = fractions[df['cohort'] == '2'] + compare(fractions2, fractions1, expected_fold=1.41) + + +if __name__=='__main__': + host: str | None + if len(sys.argv) == 2: + host = sys.argv[1] + else: + host = get_default_host(None) + if host is None: + raise RuntimeError('Could not determine API server.') + test(host) diff --git a/analysis_replication/melanoma_il2.py b/analysis_replication/melanoma_il2.py index 1122fc7d6..2fc75a363 100644 --- a/analysis_replication/melanoma_il2.py +++ b/analysis_replication/melanoma_il2.py @@ -1,71 +1,96 @@ -"""Data analysis script for one dataset.""" +"""Data analysis script for one the "Melanoma intralesional IL2" study.""" import sys +from pandas import DataFrame, Series -from numpy import mean +from scipy.stats import fisher_exact from accessors import DataAccessor +from accessors import get_default_host +from accessors import univariate_pair_compare as compare +from accessors import handle_expected_actual -study = 'Melanoma intralesional IL2' -if len(sys.argv) == 1: - access = DataAccessor(study) -else: - host = sys.argv[1] + +def test(host): + study = 'Melanoma intralesional IL2' access = DataAccessor(study, host=host) -exhausted = {'positive_markers': ['KI67', 'PD1', 'LAG3', 'TIM3'], 'negative_markers': []} -df = access.counts(['CD8+ T cell', exhausted]) -print(df) + exhausted = {'positive_markers': ['KI67', 'PD1', 'LAG3', 'TIM3'], 'negative_markers': []} + df = access.counts(['CD8+ T cell', exhausted]) + + # On average, the fraction of cells that are CD8+ T cells and KI67+ LAG3+ PD1+ TIM3+ is 24.51 + # times higher in cohort 3 than in cohort 1. + fractions = df['CD8+ T cell and KI67+ PD1+ LAG3+ TIM3+'] / df['all cells'] + fractions1 = fractions[df['cohort'] == '1'] + fractions3 = fractions[df['cohort'] == '3'] + compare(fractions1, fractions3, expected_fold=24.51, show_pvalue=True) + + # On average, the ratio of the number of cells that are CD8+ T cells and KI67+ LAG3+ PD1+ TIM3+ + # to those that are CD8+ T cells is 6.29 times higher in cohort 3 than in cohort 1. + fractions = df['CD8+ T cell and KI67+ PD1+ LAG3+ TIM3+'] / df['CD8+ T cell'] + fractions1 = fractions[df['cohort'] == '1'] + fractions3 = fractions[df['cohort'] == '3'] + compare(fractions1, fractions3, expected_fold=6.29, show_pvalue=True) -# On average, the fraction of cells that are CD8+ T cells and KI67+ LAG3+ PD1+ TIM3+ is 24.51 -# times higher in cohort 3 than in cohort 1. -fractions = df['CD8+ T cell and KI67+ PD1+ LAG3+ TIM3+'] / df['all cells'] -fractions1 = fractions[df['cohort'] == '1'] -fractions3 = fractions[df['cohort'] == '3'] -mean1 = mean(fractions1) -mean3 = mean(fractions3) -print((mean3, mean1, mean3 / mean1)) + mhci = {'positive_markers': ['MHCI'], 'negative_markers': []} + df = access.counts(['Tumor', mhci]) -# On average, the ratio of the number of cells that are CD8+ T cells and KI67+ LAG3+ PD1+ TIM3+ -# to those that are CD8+ T cells is 6.29 times higher in cohort 3 than in cohort 1. -fractions = df['CD8+ T cell and KI67+ PD1+ LAG3+ TIM3+'] / df['CD8+ T cell'] -fractions1 = fractions[df['cohort'] == '1'] -fractions3 = fractions[df['cohort'] == '3'] -mean1 = mean(fractions1) -mean3 = mean(fractions3) -print((mean3, mean1, mean3 / mean1)) + # On average, the ratio of the number of cells that are MHCI+ and Tumor to those that are Tumor + # is 1.86 times higher in cohort 3 than in cohort 1. + fractions = df['Tumor and MHCI+'] / df['Tumor'] + fractions1 = fractions[df['cohort'] == '1'] + fractions3 = fractions[df['cohort'] == '3'] + compare(fractions1, fractions3, expected_fold=1.86) -print('') + # The average value of the proximity score for phenotype(s) B cells is 3.59 times higher in + # cohort 3 than in cohort 1. + df = access.proximity(['B cell', 'B cell']) + values1 = df[df['cohort'] == '1']['proximity, B cell and B cell'] + values3 = df[df['cohort'] == '3']['proximity, B cell and B cell'] + compare(values1, values3, expected_fold=3.59) -mhci = {'positive_markers': ['MHCI'], 'negative_markers': []} -df = access.counts(['Tumor', mhci]) -print(df) + # The average value of the neighborhood enrichment score for phenotype(s) B cells is 80.45 times + # higher in cohort 1 than in cohort 3. + df = access.neighborhood_enrichment(['B cell', 'B cell']) + values1 = df[df['cohort'] == '1']['neighborhood enrichment, B cell and B cell'] + values3 = df[df['cohort'] == '3']['neighborhood enrichment, B cell and B cell'] + compare(values3, values1, expected_fold=80.45, do_log_fold=True) -# On average, the ratio of the number of cells that are MHCI+ and Tumor to those that are Tumor -# is 1.86 times higher in cohort 3 than in cohort 1. -fractions = df['Tumor and MHCI+'] / df['Tumor'] -fractions1 = fractions[df['cohort'] == '1'] -fractions3 = fractions[df['cohort'] == '3'] -mean1 = mean(fractions1) -mean3 = mean(fractions3) -print((mean3, mean1, mean3 / mean1)) + for phenotype, expected_baseline, expected_percentage, expected_p in [ + ('Adipocyte or Langerhans cell', 3.593e-2, 80, 6e-34), + ([{'positive_markers': ['S100B'], 'negative_markers': ['SOX10']}], 6.710e-2, 82, 2.783e-21) + ]: + result_df = DataFrame(columns=['odd ratio', 'p-value']) + result_df.index.name = 'specimen' + df = access.counts(phenotype) + df = df[df['cohort'].isin({'1', '3'})] + important_proportion = access.important(phenotype) + if (type(phenotype) is list): + phenotype = access.name_for_all_phenotypes(phenotype) + for specimen, row in df.iterrows(): + n_cells_of_this_phenotype = row[phenotype][0] + n_cells_total = row['all cells'] + p_important = important_proportion[specimen] + odd_ratio, p_value = fisher_exact([ + [n_cells_of_this_phenotype, p_important], + [n_cells_total, 100], + ]) + result_df.loc[specimen] = [odd_ratio, p_value] + print(f'\nBaseline presence of {phenotype}') + handle_expected_actual(expected_baseline, (df[phenotype].iloc[:, 0]/df['all cells']).mean()) + print(f'\nPercentage of top 100 most important cells') + handle_expected_actual(expected_percentage, Series(important_proportion).mean()) + print(f'\nLargest p-value from fisher exact test across all specimens') + handle_expected_actual(expected_p, result_df['p-value'].max()) + print('') -# The average value of the proximity score for phenotype(s) B cells is 3.59 times higher in -# cohort 3 than in cohort 1. -df = access.proximity(['B cell', 'B cell']) -print(df) -values1 = df[df['cohort'] == '1']['proximity, B cell and B cell'] -values3 = df[df['cohort'] == '3']['proximity, B cell and B cell'] -mean1 = mean(values1) -mean3 = mean(values3) -print((mean3, mean1, mean3 / mean1)) -# The average value of the neighborhood enrichment score for phenotype(s) B cells is 80.45 times -# higher in cohort 1 than in cohort 3. -df = access.neighborhood_enrichment(['B cell', 'B cell']) -print(df) -values1 = df[df['cohort'] == '1']['neighborhood enrichment, B cell and B cell'] -values3 = df[df['cohort'] == '3']['neighborhood enrichment, B cell and B cell'] -mean1 = mean(values1) -mean3 = mean(values3) -print((mean1, mean3, mean1 / mean3)) +if __name__ == '__main__': + host: str | None + if len(sys.argv) == 2: + host = sys.argv[1] + else: + host = get_default_host(None) + if host is None: + raise RuntimeError('Could not determine API server.') + test(host) diff --git a/analysis_replication/results.txt b/analysis_replication/results.txt new file mode 100644 index 000000000..43d3f1e74 --- /dev/null +++ b/analysis_replication/results.txt @@ -0,0 +1,29 @@ +Melanoma intralesional IL2 + +24.511033976415323 (0.010047670356693218, 0.0004099243779907919, 24.511033976415323) p-value: 0.05666978568568671 +6.294433588209584 (0.047486682117734144, 0.007544234354411779, 6.294433588209584) p-value: 0.03268081246064705 +1.8632253173760547 (0.7741293927288532, 0.4154781418594317, 1.8632253173760547) +3.591943278657352 (6.546755795065627, 1.822622265213711, 3.591943278657352) +80.44603390109388 (0.13545300354857456, 0.0016837747864004108, 80.44603390109388) log fold: 0.3918886790459368 + +Melanoma CyTOF ICI + +1.2344305898477363 (0.579526623254723, 0.46946878019784494, 1.2344305898477363) log fold: 1.22996178379338 +1.4110054329515245 (0.05109818481900941, 0.0362140241459757, 1.4110054329515245) + +Breast cancer IMC + +1.6215579838416214 (53.51380710437757, 33.00147613445089, 1.6215579838416214) p-value: 0.03558768297763137 +1.2652227187661162 (13.995947353340513, 11.062042394393446, 1.2652227187661162) +0.008058344647539068 +0.009062759768070526 +111.31691959795391 (5.236077375766945, 0.047037569802310525, 111.31691959795391) p-value: 0.5115235822483603 +11.39823595697709 (5.236077375766945, 0.4593761171053699, 11.39823595697709) p-value: 0.4816762958394687 + +LUAD progression + +1.5708529575671681 (0.3472407606181818, 0.22105236454210525, 1.5708529575671681) p-value: 0.11338628422238624 +797.4567230612785 (0.013456819020965162, 1.687466997495124e-05, 797.4567230612785) log fold: 0.8493007250260091 p-value (after log): 0.07096354161594853 +9.857655019450066 (0.02504683964315889, 0.002540851713083807, 9.857655019450066) log fold: 0.8154636349597435 p-value (after log): 0.05431382849024341 +4.5446431417428945 (1.2854687895449681, 0.28285362556585336, 4.5446431417428945) p-value: 0.06383320899190638 +3.785680226753365 (0.32701237710297304, 0.08638140506215496, 3.785680226753365) p-value: 0.06481355368930636 diff --git a/analysis_replication/run_all.py b/analysis_replication/run_all.py new file mode 100644 index 000000000..32bb4eb77 --- /dev/null +++ b/analysis_replication/run_all.py @@ -0,0 +1,22 @@ +import sys + +from accessors import get_default_host + +from melanoma_il2 import test as test1 +from melanoma_ici import test as test2 +from breast_imc import test as test3 +from luad_imc import test as test4 + + +if __name__=='__main__': + host: str | None + if len(sys.argv) == 2: + host = sys.argv[1] + else: + host = get_default_host(None) + if host is None: + raise RuntimeError('Could not determine API server.') + else: + print('Using host: ' + str(host)) + for test in [test1, test2, test3, test4]: + test(host) diff --git a/build/ondemand/Dockerfile b/build/ondemand/Dockerfile index 03475677e..1f9f16333 100644 --- a/build/ondemand/Dockerfile +++ b/build/ondemand/Dockerfile @@ -22,4 +22,4 @@ ENV service_name $service_name COPY $WHEEL_FILENAME ./ RUN python -m pip install "$WHEEL_FILENAME" ENV API_SERVER_PORT=8016 -ENTRYPOINT ["spt", "ondemand", "start", "--host", "0.0.0.0", "--port", "8016", "--timeout-seconds", "120"] +ENTRYPOINT ["spt", "ondemand", "start", "--host", "0.0.0.0", "--port", "8016", "--timeout-seconds", "360"]