forked from azodichr/MotifDiscovery
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_imp.py
executable file
·89 lines (74 loc) · 2.34 KB
/
parse_imp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Sorts the feature-importance files written by RandomForest_v2.0 and related scikit-learn ML scripts (most important first) and can optionally keep only the top features.
Required input:
-f : path to file or path to directory with multiple imp.txt files
Other options:
-n : Gives top n most important features
-p : Gives top percent p most important features
-value : default = True; if False (also accepted: false, f), write only the feature names without their importance values
"""
import os, sys
import operator
def parse_args(argv):
    """Parse the ``-flag value`` pairs of a command line.

    Flags are looked for at positions 1, 3, 5, ... of ``argv`` and each one
    consumes the token that follows it. Tokens at those positions that are
    not a known flag are ignored.

    Args:
        argv: full argument list, program name first (e.g. ``sys.argv``).

    Returns:
        dict with the keys ``f``, ``n``, ``cutoff``, ``p`` and ``value``.
        Options that were not supplied keep their defaults: ``"help"`` for
        ``f``, ``"True"`` for ``value`` and the sentinel ``"n"`` (not set)
        for the others. ``n`` is converted to ``int``; every other option
        stays a string.

    Raises:
        ValueError: if a known flag is the last token (no value follows it)
            or if the value given to ``-n`` is not an integer.
    """
    opts = {"f": "help", "n": "n", "cutoff": "n", "p": "n", "value": "True"}
    known_flags = ('-f', '-n', '-cutoff', '-p', '-value')
    for i in range(1, len(argv), 2):
        flag = argv[i]
        if flag not in known_flags:
            continue
        if i + 1 >= len(argv):
            # Used to surface as a bare IndexError on argv[i+1].
            raise ValueError("option %s requires a value" % flag)
        arg = argv[i + 1]
        if flag == '-n':
            try:
                opts["n"] = int(arg)
            except ValueError:
                raise ValueError("option -n expects an integer, got %r" % arg)
        else:
            opts[flag[1:]] = arg
    return opts


try:
    _opts = parse_args(sys.argv)
except ValueError as err:
    # Exit with a readable message instead of a traceback.
    sys.exit("Error: %s" % err)

f = _opts["f"]            # -f: path to an imp.txt file or to a directory of them
n = _opts["n"]            # -n: keep the top n features ("n" = not set)
cutoff = _opts["cutoff"]  # -cutoff: importance cutoff; parsed, but not used by the code visible in this file
p = _opts["p"]            # -p: keep the top p percent of features ("n" = not set)
value = _opts["value"]    # -value: "True" writes importance values, "False"/"false"/"f" writes names only
def _write_features(path, features, value):
    """Write ``(name, importance)`` pairs to ``path``, one feature per line.

    ``value == "True"`` writes ``name<TAB>importance``; ``"False"``,
    ``"false"`` or ``"f"`` writes the name only. Any other setting leaves
    the created file empty, as the original code did.
    """
    with open(path, 'w') as out:
        if value == "True":
            for name, imp in features:
                out.write("%s\t%s\n" % (name, imp))
        elif value in ("False", "false", "f"):
            for name, _ in features:
                out.write("%s\n" % name)


def sort(f):
    """Rank the features of one importance file and write the selection.

    ``f`` is read as tab-separated ``feature<TAB>importance`` lines. If a
    feature name occurs more than once, its last value is kept. Features
    are ordered by descending importance and written next to the input:

    * neither ``n`` nor ``p`` set -> every feature, to ``f + "_sort"``
    * ``n`` set                  -> the first ``n``, to ``f + "_top<n>"``
    * otherwise (``p`` set)      -> the first ``p`` percent (rounded down),
      to ``f + "_top<p>perc"``

    ``n``, ``p`` and ``value`` are the module-level option settings; ``n``
    takes precedence when both ``n`` and ``p`` are given.

    Args:
        f: path of the importance file to read.

    Raises:
        ValueError: if a non-blank line does not hold exactly two
            tab-separated fields or the importance is not a number.
    """
    scores = {}
    # Context managers close both files; the original left them open.
    with open(f, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line used to abort the whole run
                # with an unpacking ValueError.
                continue
            kmer, val = line.split("\t")
            scores[kmer] = float(val)
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    if n == p == "n":
        name = f + "_sort"
        selected = ranked
    elif n != "n":
        name = f + "_top" + str(n)
        selected = ranked[0:n]
    else:
        name = f + "_top" + str(p) + "perc"
        top = int(float(len(ranked)) * float(p) * 0.01)
        selected = ranked[0:top]

    _write_features(name, selected, value)
# Script entry: decide whether -f names a directory of importance files or a
# single file, and run sort() on each input.
if f == "help" and not os.path.exists(f):
    # No -f was given (f still holds its default). Show the usage text rather
    # than failing inside os.listdir("help").
    print(__doc__)
    sys.exit(1)
elif os.path.isdir(f):
    # Ask the filesystem instead of looking for ".txt" in the path string, so
    # a directory whose path contains ".txt" is still treated as a directory.
    print("Parsing all .imp files in directory")
    for j in os.listdir(f):
        # Skip hidden files and anything that is not an *_imp.txt file.
        if j.startswith(".") or "_imp.txt" not in j:
            continue
        print(j)
        # os.listdir returns bare names; join with the directory so the file
        # is found even when the current directory is somewhere else.
        sort(os.path.join(f, j))
else:
    print("Parsing given file")
    sort(f)