Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add feature set permutation test #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 60 additions & 13 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ RAW_DATA_DIR = join(DATA_DIR, 'raw')
OUTPUT_DIR = 'output/%s' % config['model']
FIGURES_DIR = join(OUTPUT_DIR, 'figs')
PERMUTATION_DIR = join(OUTPUT_DIR, 'permutations')
OUTCOMES_PERMUTATION_DIR = join(PERMUTATION_DIR, 'outcomes')
FEATURES_PERMUTATION_DIR = join(PERMUTATION_DIR, 'features')
MODELS_DIR = join(OUTPUT_DIR, 'models')

# Data files
Expand All @@ -35,8 +37,12 @@ MODEL_SUMMARY = join(OUTPUT_DIR, '%s-models-summary.tsv' % config['model'])

BIOMARKER_DCB_PLOT_OUTPUT = join(OUTPUT_DIR, '%s-biomarker-dcb-plot-data.json' % config['model'])

PERMUTATION_TEST_PREFIX = join(PERMUTATION_DIR, '%s-permuted-results' % config['model'])
PERMUTATION_TEST_RESULTS = join(OUTPUT_DIR, '%s-permutation-test-results.json' % config['model'])
OUTCOME_PERMUTATION_TEST_PREFIX = join(OUTCOMES_PERMUTATION_DIR, '%s-outcome-permuted-results' % config['model'])
FEATURE_PERMUTATION_TEST_PREFIX = join(FEATURES_PERMUTATION_DIR, '%s-feature-permuted-results' % config['model'])
OUTCOME_PERMUTATION_TEST_RESULTS = join(PERMUTATION_DIR, '%s-outcome-permutation-test-results.json' % config['model'])
FEATURE_PERMUTATION_TEST_RESULTS_PREFIX = join(PERMUTATION_DIR, '%s-feature-permutation-test' % config['model'])

PERMUTATION_TEST_SUMMARY = join(OUTPUT_DIR, '%s-permutation-test-summary.tsv' % config['model'])

# Plots
FIGS_PREFIX = join(FIGURES_DIR, 'fig')
Expand Down Expand Up @@ -129,8 +135,8 @@ rule summarize_models:
output:
MODEL_SUMMARY
shell:
'python summarize_models.py -rf {input.all_features} {input.excluding_features} '\
'-of {output} -efc none {params.feature_classes}'
'python summarize.py -of {output} model -efc none {params.feature_classes} '\
'-rf {input.all_features} {input.excluding_features}'

# Do follow up analysis
rule biomarkers_and_dcb:
Expand All @@ -142,7 +148,7 @@ rule biomarkers_and_dcb:
shell:
'python associate_biomarkers_with_dcb.py -ff {input.features} -rf {input.results} -o {output}'

rule map_permutation_test:
rule map_outcome_permutation_test:
input:
features=PROCESSED_FEATURES,
outcomes=PROCESSED_OUTCOMES,
Expand All @@ -155,29 +161,69 @@ rule map_permutation_test:
tol=config['tol']
threads: config['n_jobs']
output:
"%s-{index}.json" % PERMUTATION_TEST_PREFIX
"%s-{index}.json" % OUTCOME_PERMUTATION_TEST_PREFIX
shell:
'python permutation_test.py -of {output} map -ff {input.features} '\
'python permutation_test.py -of {output} -tt outcome map -ff {input.features} '\
'-fcf {input.feature_classes} -ocf {input.outcomes} '\
'-nj {params.n_jobs} -ers {params.random_seed} -prs {params.random_seed}'\
' -m {params.model} -mi {params.max_iter} -t {params.tol}'

rule reduce_permutation_test:
rule reduce_outcome_permutation_test:
input:
permutation_test_files=expand("%s-{index}.json" % OUTCOME_PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
results_file=MODEL_RESULTS
output:
OUTCOME_PERMUTATION_TEST_RESULTS
shell:
'python permutation_test.py -of {output} -tt outcome reduce '\
'-rf {input.results_file} -pf {input.permutation_test_files}'

rule map_feature_permutation_test:
input:
features=PROCESSED_FEATURES,
outcomes=PROCESSED_OUTCOMES,
feature_classes=PROCESSED_FEATURE_CLASSES
params:
random_seed=lambda wildcards, output: config['random_seed'] + int(wildcards['index']),
n_jobs=config['n_jobs'],
model=config['model'],
max_iter=config['max_iter'],
tol=config['tol']
threads: config['n_jobs']
output:
"%s-{feature_class}-{index}.json" % FEATURE_PERMUTATION_TEST_PREFIX
shell:
'python permutation_test.py -of {output} -tt feature map -ff {input.features} '\
'-fcf {input.feature_classes} -ocf {input.outcomes} -fc {wildcards.feature_class} '\
'-nj {params.n_jobs} -ers {params.random_seed} -prs {params.random_seed}'\
' -m {params.model} -mi {params.max_iter} -t {params.tol}'

rule reduce_feature_permutation_test:
input:
permutation_test_files=expand("%s-{index}.json" % PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
permutation_test_files=expand("%s-{{feature_class}}-{index}.json" % FEATURE_PERMUTATION_TEST_PREFIX, index=range(1, config['n_permutations']+1)),
results_file=MODEL_RESULTS
output:
PERMUTATION_TEST_RESULTS
'%s-{feature_class}-results.json' % FEATURE_PERMUTATION_TEST_RESULTS_PREFIX
shell:
'python permutation_test.py -of {output} reduce '\
'python permutation_test.py -of {output} -tt feature reduce '\
'-rf {input.results_file} -pf {input.permutation_test_files}'

rule summarize_permutation_test:
input:
feature=expand('%s-{feature_class}-results.json' % FEATURE_PERMUTATION_TEST_RESULTS_PREFIX, feature_class=FEATURE_CLASSES),
outcome=OUTCOME_PERMUTATION_TEST_RESULTS
output:
PERMUTATION_TEST_SUMMARY
shell:
'python summarize.py -o {output} permutation '\
'-i {input.feature} {input.outcome}'

# Make plots
rule plot:
input:
biomarkers=BIOMARKER_DCB_PLOT_OUTPUT,
results=MODEL_RESULTS,
permutation_test_results=PERMUTATION_TEST_RESULTS,
permutation_test_results=OUTCOME_PERMUTATION_TEST_RESULTS,
coefficients=MODEL_COEFFICIENTS
params:
ext=config['figure_format']
Expand All @@ -196,4 +242,5 @@ rule all:
FIG1,
FIG2,
FIG3,
MODEL_SUMMARY
MODEL_SUMMARY,
PERMUTATION_TEST_SUMMARY
7 changes: 7 additions & 0 deletions configs/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
model: 'en'
n_permutations: 100
random_seed: 12345
n_jobs: 24
figure_format: 'png'
max_iter: 10000
tol: 1e-5
59 changes: 40 additions & 19 deletions permutation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
from models import MODEL_NAMES, init_model, FEATURE_CLASSES
from i_o import getLogger

# Constants
FEATURE_TEST = 'feature'
OUTCOME_TEST = 'outcome'

################################################################################
# MAP
# OUTCOME PERMUTATION TEST
################################################################################
# Generate permuted data and train a model
def map_permutation_test(args):
Expand All @@ -19,6 +23,7 @@ def map_permutation_test(args):
# Load required modules
from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_predict
from metrics import compute_metrics
from sklearn.utils import check_random_state, safe_indexing

# Load the input data
X = pd.read_csv(args.feature_file, index_col=0, sep='\t')
Expand All @@ -32,8 +37,22 @@ def map_permutation_test(args):
outcome_name = y.columns[0]

# Restrict to the training columns
selected_feature_classes = set(map(str.capitalize, set(FEATURE_CLASSES) - set(args.excluded_feature_classes)))
training_cols = feature_classes['Class'].isin(selected_feature_classes).index.tolist()
logger.info('* Permuting %ss...' % args.test_type)
np.random.seed(args.permutation_random_seed)
if args.test_type == FEATURE_TEST:
training_cols = feature_classes['Class'].index.tolist()
permuted_features = feature_classes.loc[feature_classes['Class'].isin(map(str.capitalize, args.feature_classes))].index.tolist()
for f in permuted_features:
X[f] = np.random.permutation(X[f])

# Permute the outcomes
elif args.test_type == OUTCOME_TEST:
feature_class_names = set(map(str.capitalize, set(FEATURE_CLASSES) - set(args.feature_classes)))
training_cols = feature_classes.loc[feature_classes['Class'].isin(feature_class_names)].index.tolist()
y[outcome_name] = np.random.permutation(y[outcome_name])

else:
raise NotImplementedError('Test type "%s" not implemented.' % args.test_type)

############################################################################
# RUN PERMUTATION TEST
Expand All @@ -42,14 +61,9 @@ def map_permutation_test(args):
pipeline, gscv = init_model(args.model, args.n_jobs,
args.estimator_random_seed, args.max_iter, args.tol)

# Permute the outcomes
np.random.seed(args.permutation_random_seed)
y[outcome_name] = np.random.permutation(y[outcome_name])

# Convert dataframes to matrices to avoid dataframe splitting error
outer_cv = LeaveOneOut()
preds = pd.Series(cross_val_predict(estimator = gscv,
X=X.loc[:,training_cols],
preds = pd.Series(cross_val_predict(estimator = gscv, X=X.loc[:,training_cols],
y=y[outcome_name], cv=outer_cv,
n_jobs = args.n_jobs,
verbose=61 if args.verbosity > 0 else 0),
Expand All @@ -65,6 +79,7 @@ def map_permutation_test(args):
############################################################################
with open(args.output_file, 'w') as OUT:
output = {
"params": vars(args),
"var_explained": var_explained.tolist(),
"true": sub_y.tolist(),
"preds": "sub_preds",
Expand All @@ -74,9 +89,6 @@ def map_permutation_test(args):
output.update(metric_vals.items())
json.dump( output, OUT )

################################################################################
# REDUCE
################################################################################
# Read in a bunch of results on permuted data and compute significance
def reduce_permutation_test(args):
############################################################################
Expand All @@ -92,14 +104,18 @@ def reduce_permutation_test(args):

# Load permuted results files
permutation_scores = []
for permuted_results_file in args.permuted_results_files:
feature_classes = None
for i, permuted_results_file in enumerate(args.permuted_results_files):
with open(permuted_results_file, 'r') as IN:
permutation_scores.append( json.load(IN)['mse']['held-out'] )
obj = json.load(IN)
permutation_scores.append( obj['mse']['held-out'] )
if i == 0 and args.test_type == FEATURE_TEST:
feature_classes = obj.get('params').get('feature_classes')

n_permutations = len(permutation_scores)

# Compute P-value
pvalue = (1. + sum(1. for s in permutation_scores if s >= true_score))/(n_permutations + 1.)
pvalue = (1. + sum(1. for s in permutation_scores if s <= true_score))/(n_permutations + 1.)
logger.info('- No. permutations: %s' % n_permutations)
logger.info('- True score: %.5f' % true_score)
logger.info('- P-value: p < %s' % pvalue)
Expand All @@ -110,7 +126,7 @@ def reduce_permutation_test(args):
with open(args.output_file, 'w') as OUT:
output = dict(permutation_scores=permutation_scores,
true_score=true_score, n_permutations=n_permutations,
pvalue=pvalue, params=vars(args))
pvalue=pvalue, params=vars(args), feature_classes=feature_classes)
json.dump( output, OUT )

################################################################################
Expand All @@ -124,6 +140,9 @@ def get_parser():

parser.add_argument('-v', '--verbosity', type=int, required=False, default=logging.INFO)
parser.add_argument('-of', '--output_file', type=str, required=True)
parser.add_argument('-tt', '--test_type', required=True,
choices=[FEATURE_TEST, OUTCOME_TEST],
help='Type of permutation test (feature or outcome).')

# Mapping arguments
map_parser = subparser.add_parser("map")
Expand All @@ -142,8 +161,10 @@ def get_parser():
default=12345, required=False)
map_parser.add_argument('-prs', '--permutation_random_seed', type=int,
default=12345, required=False)
map_parser.add_argument('-efc', '--excluded_feature_classes', type=str, required=False, nargs='*',
default=[], choices=FEATURE_CLASSES)
map_parser.add_argument('-fc', '--feature_classes', type=str, required=False,
default=[], choices=FEATURE_CLASSES, nargs='*',
help='For outcome permutation test, these are feature classes to be excluded. '\
'For feature permutation test, these are the features to permute.')

# Reduce arguments
reduce_parser = subparser.add_parser("reduce")
Expand All @@ -159,7 +180,7 @@ def run(args):
elif args.mode == 'reduce':
reduce_permutation_test(args)
else:
raise NotImplementedError('Mode "%s" not implemented.' % args.mode)
raise NotImplementedError('Mode "%s" not implemented for permutation test.' % args.mode)

if __name__ == '__main__':
run( get_parser().parse_args(sys.argv[1:]) )
83 changes: 83 additions & 0 deletions summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python

# Load required modules
import sys, os, argparse, json, pandas as pd, numpy as np
from permutation_test import OUTCOME_TEST

# Model summary
def model_summary(args):
assert( len(args.results_files) == len(args.excluded_feature_classes))

# Load the results files to construct dataframe
items = []
for result_file, excluded_feature_class in zip(args.results_files, args.excluded_feature_classes):
with open(result_file, 'r') as IN:
obj = json.load(IN)
mses = obj['mse']
rmses = obj['rmse']
var_explained = obj['variance_explained']
maes = obj['mae']
n_features = len(obj['training_features'])
item = { "Excluded Feature Classes": excluded_feature_class.capitalize(), "No. features": n_features }
for metric_name, metric in zip(['mae', 'mse', 'rmse'], [mses, rmses, maes]):
item[metric_name.upper()] = metric['held-out']
item['Variance explained'] = var_explained
items.append(item)

# Convert to DataFrame and output to file
df = pd.DataFrame(items)
df = df.sort_values('Variance explained', ascending=False)
df = df[['Excluded Feature Classes', 'No. features', 'Variance explained', 'MSE', 'RMSE', 'MAE']]
df.to_csv(args.output_file, sep='\t', index=False)

# Permutation summary
def permutation_summary(args):
items = []
for input_file in args.input_files:
# Load the file
with open(input_file, 'r') as IN:
obj = json.load(IN)
params = obj.get('params')

#
if params.get('test_type') == OUTCOME_TEST:
feature_classes = ''
else:
feature_classes = ' '.join(map(str.capitalize, obj.get('feature_classes')))

# Extract relevant information
items.append({
'Test type': params.get('test_type').capitalize(),
'N': obj.get('n_permutations'),
'Feature class': feature_classes,
'Score': obj.get('true_score'),
'P-value': obj.get('pvalue'),
'Mean Permuted Score': np.mean(obj.get('permutation_scores'))
})

# Convert to DataFrame and save
df = pd.DataFrame(items)[['Test type', 'Feature class', 'Score', 'N', 'Mean Permuted Score', 'P-value']]
df = df.sort_values('Test type')
df.to_csv(args.output_file, sep='\t', index=False)

if __name__ == '__main__':
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('-of', '--output_file', type=str, required=True)
subparser = parser.add_subparsers(dest='mode', help='Model or permutation test.')

model_parser = subparser.add_parser('model')
model_parser.add_argument('-rf', '--results_files', nargs='*', type=str, required=True)
model_parser.add_argument('-efc', '--excluded_feature_classes', type=str, nargs='*', required=True)

permute_parser = subparser.add_parser('permutation')
permute_parser.add_argument('-i', '--input_files', type=str, required=True, nargs='*')
args = parser.parse_args(sys.argv[1:])

# Call the appropriate summarizer
if args.mode == 'model':
model_summary(args)
elif args.mode == 'permutation':
permutation_summary(args)
else:
raise NotImplementedError('Mode "%s" not implemented.' % args.mode)
35 changes: 0 additions & 35 deletions summarize_models.py

This file was deleted.

Loading