#!/usr/bin/env python
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import argparse
import os
import sys
import multiprocessing
from read_plc_data import read_plc_data
from model_utils import train_test_model
#-------------------------------------------------------------------------------
def parse_and_process_args():
    description = """Train and test task-specific SFs."""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--sfname', required=True,
                        choices=['BT-Score', 'BT-Dock', 'BT-Screen',
                                 'RF-Score', 'X-Score'],
                        help="""The name of the scoring function you would
                        like to train and test.""",
                        type=str)
    parser.add_argument('--task', required=True,
                        choices=['score', 'dock', 'screen'],
                        help="""The task for which you would like to train
                        and test the scoring function.""",
                        type=str)
    parser.add_argument('--predictions_out_fname', required=False,
                        default=None,
                        help="""File name to which the PREDICTIONS of the
                        task-specific SF are saved.""",
                        type=str)
    parser.add_argument('--performance_out_fname', required=False,
                        default=None,
                        help="""File name to which the PERFORMANCE
                        statistics of the task-specific SF are saved.""",
                        type=str)
    parser.add_argument('--n_cpus', required=False,
                        default=None,
                        help="""The number of CPU cores to use. All CPU
                        cores will be used if this is not specified.""",
                        type=int)
    parser.add_argument('--verbose', help='increase output verbosity',
                        action='store_true')
    args = parser.parse_args()
    if args.predictions_out_fname is None:
        args.predictions_out_fname = os.path.join('data', 'output', args.task,
                                                  args.sfname + '_predictions.csv')
    if args.performance_out_fname is None:
        args.performance_out_fname = os.path.join('data', 'output', args.task,
                                                  args.sfname + '_performance.csv')
    return args
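# E.g., with --sfname BT-Score --task dock and no output names given, the
# defaults above resolve to data/output/dock/BT-Score_predictions.csv and
# data/output/dock/BT-Score_performance.csv.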
#-------------------------------------------------------------------------------
def get_ba_data(tr_df, task):
    """
    Replace task-specific labels, such as ligand-pose RMSD values for
    docking, with binding affinity (BA) data. In the data frame tr_df,
    valid BA values for the docking task are associated with ligand poses
    whose RMSD = 0, which are essentially the native conformations
    ('label' == 0). For the screening task, the rows with positive BA
    values are the actual active ligands ('label' > 0).
    """
    if task == 'dock':
        tr_df = tr_df[tr_df['label'] == 0].copy()
    elif task == 'screen':
        tr_df = tr_df[tr_df['label'] > 0].copy()
    tr_df['label'] = tr_df['ba'].copy()
    return tr_df
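# Illustration (hypothetical frame): for the docking task, a tr_df with
# columns ['label', 'ba'] and rows [(0, 6.3), (2.5, 6.3)] reduces to the
# single native-pose row, whose 'label' becomes the BA value 6.3.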
#-------------------------------------------------------------------------------
def main():
    args = parse_and_process_args()
    sfname = args.sfname.lower()
    task = args.task.lower()
    preds_ofname = args.predictions_out_fname
    perf_ofname = args.performance_out_fname
    if ((sfname == 'bt-screen' and task != 'screen')
            or (sfname == 'bt-dock' and task != 'dock')):
        error_msg = 'ERROR: Scoring function %s is incompatible with the %s task'
        print(error_msg % (sfname.upper(), task.upper()))
        print('ABORTING.')
        sys.exit(1)
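    # Select descriptor sets: the BT-* consensus SFs are trained on all
    # sixteen descriptor sets below, whereas RF-Score and X-Score use only
    # their own descriptors (rem_y presumably excludes the original SF's
    # own score from the feature set).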
    if sfname in ['bt-score', 'bt-screen', 'bt-dock']:
        descriptor_sets = ['xscore', 'affiscore', 'rfscore', 'gold',
                           'repast', 'smina', 'chemgauss', 'autodock41',
                           'ligscore', 'dsx', 'cyscore', 'padel',
                           'nnscore', 'retest', 'ecfp', 'dpocket']
    elif sfname == 'rf-score':
        descriptor_sets = ['rfscore']
    elif sfname == 'x-score':
        descriptor_sets = ['xscore']
    rem_y = sfname in ['rf-score', 'x-score']
    model_params = {'n_trees': 3000, 'depth': 10,
                    'eta': 0.01, 'l_rate': 0.01}
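    # Read the training (primary set) and test (core set) protein-ligand
    # complex data for the chosen task.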
    tr_dpath = os.path.join('data', 'input', task, 'primary-train')
    ts_dpath = os.path.join('data', 'input', task, 'core-test')
    train, ftrs_formula = read_plc_data(task, descriptor_sets=descriptor_sets,
                                        rem_y=rem_y, data_path=tr_dpath,
                                        verbose=args.verbose)
    test, ftrs_formula = read_plc_data(task, descriptor_sets=descriptor_sets,
                                       rem_y=rem_y, data_path=ts_dpath,
                                       verbose=args.verbose)
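    # RF-Score and X-Score always train on binding affinities, and BT-Score
    # does so even for the docking and screening tasks, so replace the
    # task-specific labels with BA data in those cases.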
    if ((sfname in ['rf-score', 'x-score'])
            or (sfname == 'bt-score' and task != 'score')):
        train = get_ba_data(train, task)
    n_cpus = multiprocessing.cpu_count() if args.n_cpus is None else args.n_cpus
    model_params['n_cpus'] = n_cpus
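    # Fit the model on the training complexes and score the test complexes.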
    predictions, performance = train_test_model(task, sfname, train, test,
                                                model_params)
    print('\nPerformance of %s on the %s task:' % (args.sfname, args.task))
    print(performance.to_string(index=False))
    if preds_ofname is not None:
        if args.verbose:
            print('Writing predictions to ' + preds_ofname)
        predictions.to_csv(preds_ofname, index=False)
    if perf_ofname is not None:
        if args.verbose:
            print('Writing performance statistics to ' + perf_ofname)
        performance.to_csv(perf_ofname, index=False)

if __name__ == '__main__':
    main()
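#-------------------------------------------------------------------------------
# Example invocation (assumes the descriptor files are laid out under
# data/input/<task>/primary-train and data/input/<task>/core-test):
#   python train_test_task-specific_sfs.py --sfname BT-Score --task score --verbose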