-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetamlst.py
executable file
·297 lines (206 loc) · 11.8 KB
/
metamlst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#!/usr/bin/env python3
from __future__ import print_function
try:
import argparse
import math
import sqlite3
import subprocess
import time
import gc
import os
import sys
from metaMLST_functions import *
except ImportError as e:
print ("Error while importing python modules! Remember that this script requires: sys, os, subprocess, sqlite3, argparse, re and pysam\nError: "+str(e))
sys.exit(1)
try:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
except ImportError as e:
print(e)
metamlst_print("Failed in importing Biopython. Please check Biopython is installed properly on your system!",'FAIL',bcolors.FAIL)
sys.exit(1)
#METAMLST_DBPATH=os.path.abspath(os.path.dirname(__file__))+'/metamlst_databases/metamlstDB_2019.db'
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Reconstruct the MLST loci from a BAMFILE aligned to the reference MLST loci')
parser.add_argument("BAMFILE", help="BowTie2 BAM file containing the alignments",nargs='?')
parser.add_argument("-o", metavar="OUTPUT FOLDER", help="Output Folder (default: ./out)", default='./out')
parser.add_argument("-d",'--database', metavar="DB PATH", help="Specify a different MetaMLST-Database. If unset, use the default Database. You can create a custom DB with metaMLST-index.py)")
parser.add_argument("--filter", metavar="species1,species2...", help="Filter for specific set of organisms only (METAMLST-KEYs, comma separated. Use metaMLST-index.py --listspecies to get MLST keys)")
parser.add_argument("--penalty", metavar="PENALTY", help="MetaMLST penaty for under-represented alleles", default=100, type=int)
parser.add_argument("--minscore", metavar="MINSCORE", help="Minimum alignment score for each alignment to be considered valid", default=80, type=int)
parser.add_argument("--max_xM", metavar="XM", help="Maximum SNPs rate for each alignment to be considered valid (BowTie2s XM value)", default=5, type=int)
parser.add_argument("--min_read_len", metavar="LENGTH", help="Minimum BowTie2 alignment length", default=50, type=int)
parser.add_argument("--min_accuracy", metavar="CONFIDENCE", help="Minimum threshold on Confidence score (percentage) to pass the reconstruction step", default=0.90, type=float)
parser.add_argument("--debug", help="Debug Mode", action='store_true', default=False)
parser.add_argument("--presorted", help="The input BAM file is sorted and indexed with samtools. If set, MetaMLST skips this step", action='store_true')
parser.add_argument("--quiet", help="Suppress text output", action='store_true')
parser.add_argument("--version", help="Prints version informations", action='store_true')
parser.add_argument("--nloci", metavar="NLOCI", help="Do not discard samples where at least NLOCI (percent) are detected. This can lead to imperfect MLST typing", default=100, type=int)
parser.add_argument("--log", help="generate logfiles", action="store_true")
parser.add_argument("-a", help="Write known sequences", action="store_true")
#parser.print_help()
args=parser.parse_args()
if args.version: print_version()
if not args.BAMFILE:
parser.print_help()
sys.exit(0)
#PREPARE
try:
#download the database if a non existing (but default-named) DB file is passed
if not args.database:
dbPath=check_install()
else:
dbPath=args.database
metaMLSTDB = metaMLST_db(dbPath)
conn = metaMLSTDB.conn
cursor = metaMLSTDB.cursor
except IOError:
metamlst_print("Failed to connect to the database: please check your database file!",'FAIL',bcolors.FAIL)
sys.exit(1)
try:
fil = open(args.BAMFILE,'r')
except:
metamlst_print('Unable to open the BAM file for reading: please check the path is correct','FAIL',bcolors.FAIL)
sys.exit(1)
cel = {}
sequenceBank = {}
ignoredReads = 0
totalReads = 0
glob_bamFile_sorted=False
fileName = (args.BAMFILE.split('/'))[-1].split('.')[0] if args.BAMFILE != '' else 'STDIN_'+str(int(time.time()))
if not os.path.isdir(args.o): os.mkdir(args.o)
workUnit = args.o
try:
child = subprocess.Popen(["samtools","view","-h","-"], stdout=subprocess.PIPE, stdin = fil)
except OSError as e:
print ("Error while executing samtools. Please check samtools is installed properly on your system!",e)
sys.exit(1)
for line in child.stdout:
line=line.decode()
if(line[0] != '@'):
read = line.split('\t')
readCode = read[0]
species,gene,allele = read[2].split('_')
score = (int)((read[11]).split(':')[2])
xM = (int)((read[14]).split(':')[2])
sequence = read[9]
quality = read[10]
if (args.filter and species in args.filter.split(',')) or not args.filter:
if score >= args.minscore and len(sequence) >= args.min_read_len and xM <= args.max_xM:
if species not in cel: cel[species] = {} #new species
if gene not in cel[species]: #found a new gene
cel[species][gene] = {}
sequenceBank[species+'_'+gene] = {}
if allele not in cel[species][gene]: #found a new allele
cel[species][gene][allele] = []
cel[species][gene][allele].append(score)
sequenceBank[species+'_'+gene][readCode]=len(sequence)
else: ignoredReads = ignoredReads+1
totalReads = totalReads +1
#COMPILES THE DATA STRUCTURE
for speciesKey,species in cel.items():
for geneKey, geneInfo in species.items(): #geni
maxLen = max([len(x) for x in geneInfo.values()])
for alleleCode,bowtieValues in geneInfo.items(): #alleli passata 2
geneLen = len(bowtieValues)
localScore = sum(item for item in bowtieValues)
#print "GENELEN",geneKey,alleleCode,'a=',geneLen,'b=',maxLen, 'c=',localScore
if geneLen != maxLen:
localScore = localScore - (maxLen-geneLen)*args.penalty
averageScore = float(localScore)/float(geneLen)
cel[speciesKey][geneKey][alleleCode] = (localScore,geneLen,round(averageScore,1))
# cel ['haemophilus']['haemophilus_gene'][allele] = (a, b, c) --> a: summed score, b = geneLen, c = a/b
###################################### OUT FILE 1 (LOG)
if args.log:
dfil = open(workUnit+'/'+fileName+'_'+str(int(time.time()))+'.out','w')
dfil.write("SAMPLE:\t\t\t\t\t"+args.BAMFILE+'\r\n')
dfil.write("VERSION:\t\t\t\t\t1.1\r\n")
dfil.write("PENALTY:\t\t\t\t"+repr(args.penalty)+'\r\n')
dfil.write("MIN-THRESHOLD SCORE:\t\t\t\t"+repr(args.minscore)+'\r\n')
dfil.write("TOTAL ALIGNED READS:\t\t\t\t"+repr(totalReads)+'\r\n')
dfil.write(" - OF WHICH IGNORED:\t\t\t\t"+repr(ignoredReads)+' BAM READS\r\n\r\n------------------------------ RESULTS ------------------------------\r\n')
for speciesKey,species in cel.items():
for geneKey, geneInfo in species.items(): #geni
for geneInfoKey,(score,geneLen,average) in sorted(geneInfo.items(), key= lambda x: x[1]): #alleli ordinati
dfil.write('\t'.join(map(str,[speciesKey,geneKey,geneInfoKey,score,geneLen,average]))+'\r\n')
dfil.close()
if not args.quiet:
print (bcolors.OKBLUE+'Sample file: '+bcolors.ENDC+os.path.realpath(fileName))
print (bcolors.OKBLUE+'MetaMLST Database file: '+bcolors.ENDC+os.path.basename(dbPath)+'\n')
#print ('\r\n'+bcolors.OKBLUE+(' '+fileName+' ').center(80,'-')+bcolors.ENDC)
for speciesKey,species in cel.items():
#tVar is the dictionary of locus detected in the organism
tVar = dict([(row['geneName'],0) for row in cursor.execute("SELECT geneName FROM genes WHERE bacterium = ?",(speciesKey,))])
#GENE PRESENCE
if len(tVar) < len(species.keys()):
if not args.quiet: print ('Database is broken for' +speciesKey+bcolors.FAIL+'[ - EXITING - ]'.rjust(75,' ')+bcolors.ENDC)
sys.exit(0)
for sk in species.keys():
tVar[sk] = 1
vals = sum([t for t in tVar.values()])
if not args.quiet:
gcolor= bcolors.OKGREEN if (int((float(vals)/float(len(tVar)))*100) >= args.nloci) else bcolors.FAIL
print(gcolor+' '+speciesKey.ljust(18,' ')+bcolors.ENDC+' Detected Loci: '+', '.join( [bcolors.OKGREEN + sk + bcolors.ENDC for (sk,v) in sorted(tVar.items(), key = lambda x:x[0]) if v == 1] ))
if any(v == 0 for v in tVar.values() ): print((' '*20)+'Missing Loci : '+', '.join( [bcolors.FAIL + sk + bcolors.ENDC for (sk,v) in sorted(tVar.items(), key = lambda x:x[0]) if v == 0] ))
print ("")
if int((float(vals)/float(len(tVar)))*100) >= args.nloci:
if not args.quiet: metamlst_print("Closest allele identification",'...',bcolors.HEADER)
if not args.quiet: print ("\r\n "+"Locus".ljust(7)+"Avg. Coverage".rjust(15)+"Score".rjust(7)+"Hits".rjust(6)+" Reference Allele(s)".ljust(36))
sys.stdout.flush()
for geneKey, geneInfo in sorted(species.items(),key=lambda x:x[0]): #loci
minValue = max([avg for (val,leng,avg) in geneInfo.values()])
aElements = {}
for k,(val,leng,avg) in geneInfo.items():
if avg == minValue:
aElements[k]=(val,leng,avg)
closeAllelesList = ','.join([str(a) for a in sorted(aElements.keys(), key=lambda x: int(x))][:5])+('... ('+str(len(aElements))+' more)' if len(aElements) > 5 else '')
sequenceKey = speciesKey+'_'+geneKey
cursor.execute("SELECT LENGTH(sequence) as L FROM alleles WHERE bacterium = ? AND gene = ? ORDER BY L DESC LIMIT 1", (speciesKey,geneKey))
genL = cursor.fetchone()['L']
coverage = sum([x for x in sequenceBank[sequenceKey].values()])
if not args.quiet: print (" "+bcolors.WARNING+geneKey.ljust(7)+bcolors.ENDC+str(round(float(coverage)/float(genL),2)).rjust(15)+bcolors.ENDC+bcolors.HEADER+str(minValue).rjust(7)+str(list(aElements.values())[0][1]).rjust(6)+bcolors.ENDC+bcolors.OKBLUE,closeAllelesList.ljust(36)+bcolors.ENDC)
if not args.quiet: print ("")
sys.stdout.flush()
sampleSequence = '' #for organism
if not args.quiet: metamlst_print("Building Consensus Sequences",'...',bcolors.HEADER)
if not args.presorted and not glob_bamFile_sorted:
sort_index(args.BAMFILE)
glob_bamFile_sorted=True
l = [sorted([(speciesKey+'_'+g1+'_'+k,db_getUnalSequence(metaMLSTDB.conn,speciesKey,g1,k)) for k,(val,leng,avg) in g2.items() if avg == max([avg1 for (val1,leng1,avg1) in g2.values()])],key=lambda x: int(x[0].split('_')[2]))[0] for g1,g2 in species.items()]
e1 = dict(l)
consenSeq = buildConsensus(args.BAMFILE, dict(l),args.minscore,args.max_xM,args.debug)
newProfile = 0
finWrite = 1
if not args.quiet: print ("\r\n "+"Locus".ljust(7)+"Ref.".ljust(7)+"Length".rjust(7)+"Ns".rjust(7)+"SNPs".rjust(7)+"Confidence".rjust(15)+"Notes".rjust(10))
for l in sorted(consenSeq, key= lambda x: x.id):
holes = str(l.description.split('_')[0].split('::')[1])
snps = int(l.description.split('_')[1].split('::')[1])
leng = str(len(l.seq))
leng_ns = str(round(1-float(holes)/float(leng),4)*100)+' %'
l.seqLen = len(l.seq)
# If there's a gene with low accuracy, the whole organism is discarded for this sample
if (1-float(holes)/float(leng)) <= args.min_accuracy: finWrite = 0
if snps > 0:
seqFind = sequenceFind(metaMLSTDB.conn,speciesKey,l.seq)
if seqFind: newAllele = seqFind
else:
newAllele = 'NEW'
newProfile = 1
else:
newAllele = '--'
if not args.a: l.seq = ''
if not args.quiet: print (" "+bcolors.WARNING+(l.id.split('_')[1]).ljust(7)+bcolors.ENDC+(l.id.split('_')[2]).ljust(7)+leng.rjust(7)+holes.rjust(7)+str(snps).rjust(7)+leng_ns.rjust(15)+newAllele.rjust(10))
if not args.quiet: print ('')
if finWrite:
if not args.quiet: metamlst_print("Reconstruction Successful",'WRITE',bcolors.OKGREEN)
profil = open(workUnit+'/'+fileName+'.nfo','a')
profil.write(speciesKey+'\t'+fileName+'\t'+"\t".join([recd.id+"::"+str(recd.seq)+'::'+str(round(1-float(recd.description.split('_')[0].split('::')[1])/float(recd.seqLen),4)*100)+'::'+str(round(float(recd.description.split('_')[1].split('::')[1])/float(recd.seqLen),4)*100) for recd in consenSeq])+'\r\n')
profil.close()
else:
if not args.quiet: metamlst_print("Accuracy lower than "+str(round(args.min_accuracy*100,2))+'%','SKIP',bcolors.FAIL)
cel[speciesKey] = None
gc.collect()
metaMLSTDB.closeConnection()
if len(cel) and not args.quiet: print ('\033[92m'+'[ - Completed - ]'.rjust(80,' ')+'\033[0m')