-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgene-matrix-from-uclust4.py
124 lines (68 loc) · 1.86 KB
/
gene-matrix-from-uclust4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/python
import os
import re
import glob
import sys
import subprocess
#T. Allnutt, CSIRO, 2015
#script to parse the uclust .uc file and make a table of clusters with their %identity
#usage: gene-matrix-from-uclust2.py clusters_sorted.uc pangenome.txt 60 -cds-
#nb '95' is the identiy threshold below which presence of a gene is not scored
digits = re.compile(r'(\d+)')
def tokenize(filename):
return tuple(int(token) if match else token
for token, match in
((fragment, digits.search(fragment))
for fragment in digits.split(filename)))
f = open(sys.argv[1],'r')
g = open(sys.argv[2],'w')
delim = sys.argv[3]
min_samples=int(sys.argv[4])
min_sum=int(sys.argv[5])
data = {}
hitlist=[]
c=1
ct=0
n=0
print "Parsing uc file"
for line in f:
k = line.split("\t")
if k[0]<>"C":
hit=k[8].split(delim)[0] #nb the delim separates the cds number from the genome id, so only score the genome id as a hit
if hit not in hitlist:
hitlist.append(hit)
if k[0]=="S":
n=0
ct=ct+1
cds = k[8]
data[cds]={} #new cluster
data[cds][hit]=1 #self identity
if k[0]=="H":
n=n+1
if hit not in data[cds].keys():
data[cds][hit]=1
else:
data[cds][hit]=data[cds][hit]+1
print "writing"
hitlist.sort(key=tokenize)
title="\t"+"\t".join(str(p)for p in hitlist)+"\n"
g.write(title)
output=""
for i in data.keys():
#sample count
sample_count=0
gene_sum=0
for t in data[i].values():
if t>0:
sample_count=sample_count+1
for t in data[i].values():
gene_sum=gene_sum+t
if gene_sum >= min_sum and sample_count >= min_samples:
c=c+1
g.write(i)
for j in hitlist:
if j in data[i].keys():
g.write("\t"+str(data[i][j]))
else:
g.write("\t"+"0")
g.write(output+"\n")