-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathdataset_selection.py
81 lines (64 loc) · 3.19 KB
/
dataset_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
import argparse as ap
import pandas as pd
import sys
def read_params(args):
    """Parse the command-line options of the dataset selection tool.

    Args:
        args: Argument vector in ``sys.argv`` form, i.e. the program name
            first, followed by the options. When it is ``None`` or empty,
            argparse falls back to the process-wide ``sys.argv``.

    Returns:
        A dict mapping each option name (``inp_f``, ``out_f``,
        ``feature_identifier``, ``select``, ``remove``, ``include``,
        ``exclude``, ``tout``) to its parsed value.
    """
    parser = ap.ArgumentParser(description='Select specific dataset from input dataset file')
    arg = parser.add_argument
    arg('inp_f', metavar='INPUT_FILE', nargs='?', default=sys.stdin, type=str,
        help="the input dataset file [stdin if not present]")
    arg('out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
        help="the output dataset file")
    arg('-z', '--feature_identifier', type=str, default='k__', help="the feature identifier\n")
    arg('-s', '--select', type=str, help="the samples to select\n")
    arg('-r', '--remove', type=str, help="the samples to remove\n")
    arg('-i', '--include', type=str, help="the fields to include\n")
    arg('-e', '--exclude', type=str, help="the fields to exclude\n")
    arg('-t', '--tout', action='store_true', help="transpose output dataset file\n")

    # Honour the vector handed in by the caller instead of silently reading
    # the global sys.argv; the leading program name is not an option.
    argv = list(args[1:]) if args else None
    return vars(parser.parse_args(argv))
def _parse_rules(par):
    """Return the command-line rules as an ordered list of (kind, tokens)."""
    rules = []
    # Kinds are applied in this fixed order regardless of option order.
    for kind in ('select', 'remove', 'include', 'exclude'):
        if par[kind]:
            rules.extend((kind, spec.split(':')) for spec in par[kind].split(','))
    return rules


def _tsv_options():
    """Return ``to_csv`` keywords for tab-separated, LF-terminated output."""
    import inspect

    # pandas 1.5 renamed ``line_terminator`` to ``lineterminator`` and
    # pandas 2.0 dropped the old spelling; pick whichever is installed.
    known = inspect.signature(pd.DataFrame.to_csv).parameters
    keyword = 'lineterminator' if 'lineterminator' in known else 'line_terminator'
    return {'sep': '\t', keyword: '\n'}


def main(args):
    """Filter a tab-separated dataset by sample and by field, then write it.

    The input file holds one field per row (field name in the first column)
    and one sample per remaining column. Fields whose name contains one of
    the ``feature_identifier`` tokens, plus ``unclassified``, are treated as
    features; every other field is metadata.

    Rules are colon-separated token lists, several of them joined by commas:

    * ``select`` / ``remove``: ``field:value[:value...]`` keeps / drops the
      samples whose ``field`` equals one of the values.
    * ``include`` / ``exclude``: a list of metadata field names to keep /
      drop (``_all_`` as first token of an exclude drops all metadata), or
      ``feature_level:TOKEN`` to keep / drop features whose name contains
      ``TOKEN`` (``feature_level:_all_`` in an exclude drops all features).

    Afterwards ``nd`` feature values become ``0.0`` and feature columns whose
    maximum equals their minimum are discarded. Nothing is written when no
    output file was given.

    Args:
        args: Argument vector forwarded to ``read_params``.
    """
    par = read_params(args)
    table = pd.read_csv(par['inp_f'], sep='\t', header=None, index_col=0, dtype='unicode').T

    tags = par['feature_identifier'].split(':')
    meta = [col for col in table.columns if not any(tag in col for tag in tags)]
    if 'unclassified' in meta:
        meta.remove('unclassified')
    feat = [col for col in table.columns if any(tag in col for tag in tags)]
    # Guard against listing the column twice when an identifier token
    # happens to be a substring of 'unclassified'.
    if 'unclassified' in table.columns and 'unclassified' not in feat:
        feat.append('unclassified')

    for kind, tokens in _parse_rules(par):
        # A 'feature_level' rule carries its target in the second token.
        on_features = tokens[0] == 'feature_level'
        if kind == 'select':
            table = table[table[tokens[0]].isin(tokens[1:])]
        elif kind == 'remove':
            table = table[~table[tokens[0]].isin(tokens[1:])]
        elif kind == 'include':
            if on_features:
                feat = [col for col in feat if tokens[1] in col or 'unclassified' in col]
            else:
                meta = [col for col in meta if col in tokens]
        elif kind == 'exclude':
            if on_features:
                feat = [] if tokens[1] == '_all_' else [col for col in feat if tokens[1] not in col]
            else:
                meta = [] if tokens[0] == '_all_' else [col for col in meta if col not in tokens]

    # Work on an explicit copy so the assignment below never targets a view
    # of the filtered frame.
    table = table.loc[:, meta + feat].copy()
    if feat:
        table.loc[:, feat] = table.loc[:, feat].replace(to_replace='nd', value='0.0')
        values = table.loc[:, feat]
        # A feature with identical extremes carries no information.
        constant = values.max().astype('float') == values.min().astype('float')
        table = table.drop(values.columns[constant.values], axis=1)

    if par['out_f']:
        tsv = _tsv_options()
        if par['tout']:
            table.to_csv(par['out_f'], header=True, index=False, **tsv)
        else:
            table.T.to_csv(par['out_f'], header=False, index=True, **tsv)
# Script entry point: run the selection only when this file is executed
# directly, handing the full process argument vector to main().
if __name__ == '__main__':
    main(sys.argv)