lrgr · mdml · Feb 1, 2018 · Feb 2, 2018
diff --git a/Snakefile b/Snakefile
@@ -11,6 +11,8 @@ RAW_DATA_DIR = join(DATA_DIR, 'raw')
 OUTPUT_DIR = 'output/%s' % config['model']
 FIGURES_DIR = join(OUTPUT_DIR, 'figs')
 PERMUTATION_DIR = join(OUTPUT_DIR, 'permutations')
+OUTCOMES_PERMUTATION_DIR = join(PERMUTATION_DIR, 'outcomes')
+FEATURES_PERMUTATION_DIR = join(PERMUTATION_DIR, 'features')
 MODELS_DIR = join(OUTPUT_DIR, 'models')
 
 # Data files
@@ -35,8 +37,12 @@ MODEL_SUMMARY = join(OUTPUT_DIR, '%s-models-summary.tsv' % config['model'])
 
 BIOMARKER_DCB_PLOT_OUTPUT = join(OUTPUT_DIR, '%s-biomarker-dcb-plot-data.json' % config['model'])
 
-PERMUTATION_TEST_PREFIX = join(PERMUTATION_DIR, '%s-permuted-results' % config['model'])
-PERMUTATION_TEST_RESULTS = join(OUTPUT_DIR, '%s-permutation-test-results.json' % config['model'])
+OUTCOME_PERMUTATION_TEST_PREFIX = join(OUTCOMES_PERMUTATION_DIR, '%s-outcome-permuted-results' % config['model'])
+FEATURE_PERMUTATION_TEST_PREFIX = join(FEATURES_PERMUTATION_DIR, '%s-feature-permuted-results' % config['model'])
+OUTCOME_PERMUTATION_TEST_RESULTS = join(PERMUTATION_DIR, '%s-outcome-permutation-test-results.json' % config['model'])
+FEATURE_PERMUTATION_TEST_RESULTS_PREFIX = join(PERMUTATION_DIR, '%s-feature-permutation-test' % config['model'])
+
+PERMUTATION_TEST_SUMMARY = join(OUTPUT_DIR, '%s-permutation-test-summary.tsv' % config['model'])
 
 # Plots
 FIGS_PREFIX = join(FIGURES_DIR, 'fig')
@@ -129,8 +135,8 @@ rule summarize_models:
     output:
         MODEL_SUMMARY
     shell:
-        'python summarize_models.py -rf {input.all_features} {input.excluding_features} '\
-        '-of {output} -efc none {params.feature_classes}'
+        'python summarize.py -of {output} model -efc none {params.feature_classes} '\
+        '-rf {input.all_features} {input.excluding_features}'
 
 # Do follow up analysis
 rule biomarkers_and_dcb:
@@ -142,7 +148,7 @@ rule biomarkers_and_dcb:
     shell:
         'python associate_biomarkers_with_dcb.py -ff {input.features} -rf {input.results} -o {output}'
 
-rule map_permutation_test:
+rule map_outcome_permutation_test:
     input:
         features=PROCESSED_FEATURES,
         outcomes=PROCESSED_OUTCOMES,
@@ -155,29 +161,69 @@ rule map_permutation_test:
         tol=config['tol']
     threads: config['n_jobs']
     output:
-        "%s-{index}.json" % PERMUTATION_TEST_PREFIX
+        "%s-{index}.json" % OUTCOME_PERMUTATION_TEST_PREFIX
     shell:
-        'python permutation_test.py -of {output} map -ff {input.features} '\
+        'python permutation_test.py -of {output} -tt outcome map -ff {input.features} '\
         '-fcf {input.feature_classes} -ocf {input.outcomes}  '\
         '-nj {params.n_jobs} -ers {params.random_seed} -prs {params.random_seed}'\
         ' -m {params.model} -mi {params.max_iter} -t {params.tol}'
 
-rule reduce_permutation_test:
+rule reduce_outcome_permutation_test:
+    input:
+        permutation_test_files=expand("%s-{index}.json" % OUTCOME_PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
+        results_file=MODEL_RESULTS
+    output:
+        OUTCOME_PERMUTATION_TEST_RESULTS
+    shell:
+        'python permutation_test.py -of {output} -tt outcome reduce '\
+        '-rf {input.results_file} -pf {input.permutation_test_files}'
+
+rule map_feature_permutation_test:
+    input:
+        features=PROCESSED_FEATURES,
+        outcomes=PROCESSED_OUTCOMES,
+        feature_classes=PROCESSED_FEATURE_CLASSES
+    params:
+        random_seed=lambda wildcards, output: config['random_seed'] + int(wildcards['index']),
+        n_jobs=config['n_jobs'],
+        model=config['model'],
+        max_iter=config['max_iter'],
+        tol=config['tol']
+    threads: config['n_jobs']
+    output:
+        "%s-{feature_class}-{index}.json" % FEATURE_PERMUTATION_TEST_PREFIX
+    shell:
+        'python permutation_test.py -of {output} -tt feature map -ff {input.features} '\
+        '-fcf {input.feature_classes} -ocf {input.outcomes} -fc {wildcards.feature_class} '\
+        '-nj {params.n_jobs} -ers {params.random_seed} -prs {params.random_seed}'\
+        ' -m {params.model} -mi {params.max_iter} -t {params.tol}'
+
+rule reduce_feature_permutation_test:
     input:
-        permutation_test_files=expand("%s-{index}.json" % PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
+        permutation_test_files=expand("%s-{{feature_class}}-{index}.json" % FEATURE_PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
         results_file=MODEL_RESULTS
     output:
-        PERMUTATION_TEST_RESULTS
+        '%s-{feature_class}-results.json' % FEATURE_PERMUTATION_TEST_RESULTS_PREFIX
     shell:
-        'python permutation_test.py -of {output} reduce '\
+        'python permutation_test.py -of {output} -tt feature reduce '\
         '-rf {input.results_file} -pf {input.permutation_test_files}'
 
+rule summarize_permutation_test:
+    input:
+        feature=expand('%s-{feature_class}-results.json' % FEATURE_PERMUTATION_TEST_RESULTS_PREFIX, feature_class=FEATURE_CLASSES),
+        outcome=OUTCOME_PERMUTATION_TEST_RESULTS
+    output:
+        PERMUTATION_TEST_SUMMARY
+    shell:
+        'python summarize.py -o {output} permutation '\
+        '-i {input.feature} {input.outcome}'
+
 # Make plots
 rule plot:
     input:
         biomarkers=BIOMARKER_DCB_PLOT_OUTPUT,
         results=MODEL_RESULTS,
-        permutation_test_results=PERMUTATION_TEST_RESULTS,
+        permutation_test_results=OUTCOME_PERMUTATION_TEST_RESULTS,
         coefficients=MODEL_COEFFICIENTS
     params:
         ext=config['figure_format']
@@ -196,4 +242,5 @@ rule all:
         FIG1,
         FIG2,
         FIG3,
-        MODEL_SUMMARY
+        MODEL_SUMMARY,
+        PERMUTATION_TEST_SUMMARY
diff --git a/configs/test.yml b/configs/test.yml
@@ -0,0 +1,7 @@
+model: 'en'
+n_permutations: 100
+random_seed: 12345
+n_jobs: 24
+figure_format: 'png'
+max_iter: 10000
+tol: 1e-5
diff --git a/permutation_test.py b/permutation_test.py
@@ -8,8 +8,12 @@
 from models import MODEL_NAMES, init_model, FEATURE_CLASSES
 from i_o import getLogger
 
+# Constants
+FEATURE_TEST = 'feature'
+OUTCOME_TEST = 'outcome'
+
 ################################################################################
-# MAP
+# OUTCOME PERMUTATION TEST
 ################################################################################
 # Generate permuted data and train a model
 def map_permutation_test(args):
@@ -19,6 +23,7 @@ def map_permutation_test(args):
     # Load required modules
     from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_predict
     from metrics import compute_metrics
+    from sklearn.utils import check_random_state, safe_indexing
 
     # Load the input data
     X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
@@ -32,8 +37,22 @@ def map_permutation_test(args):
     outcome_name = y.columns[0]
 
     # Restrict to the training columns
-    selected_feature_classes = set(map(str.capitalize, set(FEATURE_CLASSES) - set(args.excluded_feature_classes)))
-    training_cols = feature_classes['Class'].isin(selected_feature_classes).index.tolist()
+    logger.info('* Permuting %ss...' % args.test_type)
+    np.random.seed(args.permutation_random_seed)
+    if args.test_type == FEATURE_TEST:
+        training_cols = feature_classes['Class'].index.tolist()
+        permuted_features = feature_classes.loc[feature_classes['Class'].isin(map(str.capitalize, args.feature_classes))].index.tolist()
+        for f in permuted_features:
+            X[f] = np.random.permutation(X[f])
+
+    # Permute the outcomes
+    elif args.test_type == OUTCOME_TEST:
+        feature_class_names = set(map(str.capitalize, set(FEATURE_CLASSES) - set(args.feature_classes)))
+        training_cols = feature_classes.loc[feature_classes['Class'].isin(feature_class_names)].index.tolist()
+        y[outcome_name] = np.random.permutation(y[outcome_name])
+
+    else:
+        raise NotImplementedError('Test type "%s" not implemented.' % args.test_type)
 
     ############################################################################
     # RUN PERMUTATION TEST
@@ -42,14 +61,9 @@ def map_permutation_test(args):
     pipeline, gscv = init_model(args.model, args.n_jobs,
         args.estimator_random_seed, args.max_iter, args.tol)
 
-    # Permute the outcomes
-    np.random.seed(args.permutation_random_seed)
-    y[outcome_name] = np.random.permutation(y[outcome_name])
-
     # Convert dataframes to matrices to avoid dataframe splitting error
     outer_cv = LeaveOneOut()
-    preds = pd.Series(cross_val_predict(estimator = gscv,
-                                       X=X.loc[:,training_cols],
+    preds = pd.Series(cross_val_predict(estimator = gscv, X=X.loc[:,training_cols],
                                         y=y[outcome_name], cv=outer_cv,
                                         n_jobs = args.n_jobs,
                                         verbose=61 if args.verbosity > 0 else 0),
@@ -65,6 +79,7 @@ def map_permutation_test(args):
     ############################################################################
     with open(args.output_file, 'w') as OUT:
         output = {
+            "params": vars(args),
             "var_explained": var_explained.tolist(),
             "true": sub_y.tolist(),
             "preds": "sub_preds",
@@ -74,9 +89,6 @@ def map_permutation_test(args):
         output.update(metric_vals.items())
         json.dump( output, OUT )
 
-################################################################################
-# REDUCE
-################################################################################
 # Read in a bunch of results on permuted data and compute significance
 def reduce_permutation_test(args):
     ############################################################################
@@ -92,14 +104,18 @@ def reduce_permutation_test(args):
 
     # Load permuted results files
     permutation_scores = []
-    for permuted_results_file in args.permuted_results_files:
+    feature_classes = None
+    for i, permuted_results_file in enumerate(args.permuted_results_files):
         with open(permuted_results_file, 'r') as IN:
-            permutation_scores.append( json.load(IN)['mse']['held-out'] )
+            obj = json.load(IN)
+            permutation_scores.append( obj['mse']['held-out'] )
+            if i == 0 and args.test_type == FEATURE_TEST:
+                feature_classes = obj.get('params').get('feature_classes')
 
     n_permutations = len(permutation_scores)
 
     # Compute P-value
-    pvalue = (1. + sum(1. for s in permutation_scores if s >= true_score))/(n_permutations + 1.)
+    pvalue = (1. + sum(1. for s in permutation_scores if s <= true_score))/(n_permutations + 1.)
     logger.info('- No. permutations: %s' % n_permutations)
     logger.info('- True score: %.5f' % true_score)
     logger.info('- P-value: p < %s' % pvalue)
@@ -110,7 +126,7 @@ def reduce_permutation_test(args):
     with open(args.output_file, 'w') as OUT:
         output = dict(permutation_scores=permutation_scores,
                       true_score=true_score, n_permutations=n_permutations,
-                      pvalue=pvalue, params=vars(args))
+                      pvalue=pvalue, params=vars(args), feature_classes=feature_classes)
         json.dump( output, OUT )
 
 ################################################################################
@@ -124,6 +140,9 @@ def get_parser():
 
     parser.add_argument('-v', '--verbosity', type=int, required=False, default=logging.INFO)
     parser.add_argument('-of', '--output_file', type=str, required=True)
+    parser.add_argument('-tt', '--test_type', required=True,
+        choices=[FEATURE_TEST, OUTCOME_TEST],
+        help='Type of permutation test (feature or outcome).')
 
     # Mapping arguments
     map_parser = subparser.add_parser("map")
@@ -142,8 +161,10 @@ def get_parser():
         default=12345, required=False)
     map_parser.add_argument('-prs', '--permutation_random_seed', type=int,
         default=12345, required=False)
-    map_parser.add_argument('-efc', '--excluded_feature_classes', type=str, required=False, nargs='*',
-        default=[], choices=FEATURE_CLASSES)
+    map_parser.add_argument('-fc', '--feature_classes', type=str, required=False,
+        default=[], choices=FEATURE_CLASSES, nargs='*',
+        help='For outcome permutation test, these are feature classes to be excluded. '\
+             'For feature permutation test, these are the features to permute.')
 
     # Reduce arguments
     reduce_parser = subparser.add_parser("reduce")
@@ -159,7 +180,7 @@ def run(args):
     elif args.mode == 'reduce':
         reduce_permutation_test(args)
     else:
-        raise NotImplementedError('Mode "%s" not implemented.' % args.mode)
+        raise NotImplementedError('Mode "%s" not implemented for permutation test.' % args.mode)
 
 if __name__ == '__main__':
     run( get_parser().parse_args(sys.argv[1:]) )
diff --git a/summarize.py b/summarize.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+# Load required modules
+import sys, os, argparse, json, pandas as pd, numpy as np
+from permutation_test import OUTCOME_TEST
+
+# Model summary
+def model_summary(args):
+    assert( len(args.results_files) == len(args.excluded_feature_classes))
+
+    # Load the results files to construct dataframe
+    items = []
+    for result_file, excluded_feature_class in zip(args.results_files, args.excluded_feature_classes):
+        with open(result_file, 'r') as IN:
+            obj = json.load(IN)
+            mses = obj['mse']
+            rmses = obj['rmse']
+            var_explained = obj['variance_explained']
+            maes = obj['mae']
+            n_features = len(obj['training_features'])
+            item = { "Excluded Feature Classes": excluded_feature_class.capitalize(), "No. features": n_features }
+            for metric_name, metric in zip(['mae', 'mse', 'rmse'], [mses, rmses, maes]):
+                item[metric_name.upper()] = metric['held-out']
+            item['Variance explained'] = var_explained
+            items.append(item)
+
+    # Convert to DataFrame and output to file
+    df = pd.DataFrame(items)
+    df = df.sort_values('Variance explained', ascending=False)
+    df = df[['Excluded Feature Classes', 'No. features', 'Variance explained', 'MSE', 'RMSE', 'MAE']]
+    df.to_csv(args.output_file, sep='\t', index=False)
+
+# Permutation summary
+def permutation_summary(args):
+    items = []
+    for input_file in args.input_files:
+        # Load the file
+        with open(input_file, 'r') as IN:
+            obj = json.load(IN)
+            params = obj.get('params')
+
+            #
+            if params.get('test_type') == OUTCOME_TEST:
+                feature_classes = ''
+            else:
+                feature_classes = ' '.join(map(str.capitalize, obj.get('feature_classes')))
+
+        # Extract relevant information
+        items.append({
+            'Test type': params.get('test_type').capitalize(),
+            'N': obj.get('n_permutations'),
+            'Feature class': feature_classes,
+            'Score': obj.get('true_score'),
+            'P-value': obj.get('pvalue'),
+            'Mean Permuted Score': np.mean(obj.get('permutation_scores'))
+        })
+
+    # Convert to DataFrame and save
+    df = pd.DataFrame(items)[['Test type', 'Feature class', 'Score', 'N', 'Mean Permuted Score', 'P-value']]
+    df = df.sort_values('Test type')
+    df.to_csv(args.output_file, sep='\t', index=False)
+
+if __name__ == '__main__':
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-of', '--output_file', type=str, required=True)
+    subparser = parser.add_subparsers(dest='mode', help='Model or permutation test.')
+
+    model_parser = subparser.add_parser('model')
+    model_parser.add_argument('-rf', '--results_files', nargs='*', type=str, required=True)
+    model_parser.add_argument('-efc', '--excluded_feature_classes', type=str, nargs='*', required=True)
+
+    permute_parser = subparser.add_parser('permutation')
+    permute_parser.add_argument('-i', '--input_files', type=str, required=True, nargs='*')
+    args = parser.parse_args(sys.argv[1:])
+
+    # Call the appropriate summarizer
+    if args.mode == 'model':
+        model_summary(args)
+    elif args.mode == 'permutation':
+        permutation_summary(args)
+    else:
+        raise NotImplementedError('Mode "%s" not implemented.' % args.mode)
diff --git a/summarize_models.py b/summarize_models.py