-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtax_level.py
125 lines (82 loc) · 2.45 KB
/
tax_level.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
import sys
import operator
#Theo Allnutt 2016
#This script opens a usearch otu table output and parses the taxonomy column (last column)
#to extract each taxonomy level separately. Levels are denoted by d:,p:,c:,o:,f:,g: before the name.
#input file is e.g. otutab.txt from:
#usearch -usearch_global trimmed.fa -db otus_tax.fa -strand both -id 0.97 -otutabout otutab.txt -biomout otutab.biom
#usage:
#python tax_level.py otu_table.txt
#output tables are made in the current directory with -p, -c, -o, -f, -g suffixes
def find1(a,b):
for p in a:
if b in p:
return a.index(p)
f = open(sys.argv[1],'r')
taxcols=int(sys.argv[2])
name1=sys.argv[1].split("/")[-1].split(".")[0]
g=open(name1+"-tax.txt",'w')
name1 = sys.argv[1].split("/")[-1].split(".")[0]
print name1
#n.b. taxa = d:,p:,c:,o:,f:,g:,s:
samples=[]
for i in f:
if i[0]=="#":
samples=i.split("\t")[1:]
n=len(samples)-1
else:
break
levels=["p:","c:","o:","f:","g:","s:"]
for level in levels:
data={}
f.seek(0)
for i in f:
if i[0]<>"#":
k = i.split("\t")[1:-taxcols]
c=-1
for x in k: #convert decimals into integers
c=c+1
x=int(x.split(".")[0])
k[c]=x
tax = i.split("\t")[-1].split(",")
tax[-1]=tax[-1].rstrip("\n") #remove return from last taxl in list
pos=find1(tax,level)
#print pos, tax, level
#raw_input()
if pos is not None:
#print 'pos',pos
name=tax[pos].split(":")[-1]
if name not in data.keys():
data[name]=k
else:
for x in range(0,len(k)):
data[name][x]=data[name][x]+k[x]
else:
#add to unclassified
if "unclassified" in data.keys():
for x in range(0,len(k)):
data["unclassified"][x]=data["unclassified"][x]+k[x]
else:
data["unclassified"]=k
#print level, tax,pos,k
#print data
#print "unclassified", data["unclassified"]
#g=open(name1+"-"+level[0]+".txt",'w') # to give separate files for each tax level
g.write(level+"\t"+"\t".join(str(x) for x in samples[:-taxcols])+"\t"+"sum"+"\n")
data_sums={}
sorted_taxa=[]
#sort by abundance
for i in data.keys():
data_sums[i]=sum(data[i])
sorted_taxa = sorted(data_sums.items(), key=operator.itemgetter(1))
print sorted_taxa
for v in sorted_taxa: #data.keys():
i=v[0]
print i
g.write(i+"\t"+"\t".join(str(x) for x in data[i])+"\t")
sum1=0
for s in data[i]:
sum1=sum1+float(s)
g.write(str(sum1)+"\n")
g.write("\n")