-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathblast_parse.py.bak
executable file
·286 lines (230 loc) · 13.1 KB
/
blast_parse.py.bak
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/python
from multiprocessing import Pool
import time
import os
import sys
import argparse
from homolog4 import *
from collections import defaultdict
# Copyright(C) 2015 David Ream
# Released under GPL version 3 licence. http://www.gnu.org/licenses/lgpl.html
# Do not remove this comment
# This exists to make the main function easier to read. It contains code to run the argument parser, and does nothing else.
def parser_code():
parser = argparse.ArgumentParser(description="Parse the results of a BLAST -m8 search and organize the results by specific gene blocks. The program will save the results in a directory designated by the user, or the default './blast_parse/'.")
parser.add_argument("-i", "--infolder", dest="infolder", default='./blast_result/', metavar="DIRECTORY",
help="A file that contains the path to every organism database that you are interested in.")
parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="DIRECTORY", default='./blast_parse/',
help="Folder where the BLAST results will be stored. Default is the folder './blast_result/'.")
# rename this option to gene block file... or something more accurately conveying the idea that this is the file
# that is composed: gene_block_name: gene1, gene2, gene3... etc all fields tab delenated.
parser.add_argument("-b", "--gene_block_query", dest="gene_block_query", default='gene_block_names_and_genes.txt', metavar="FILE",
help="A file that contains the names and genes comprising the gene blocks that are under investigation.")
parser.add_argument("-f", "--filter", dest="filter", default='', metavar="FILE",
help="A file that contains the accession numbers of the organisms that are under investigation.")
parser.add_argument("-n", "--num_proc", dest="num_proc", metavar="INT", default = os.sysconf("SC_NPROCESSORS_CONF"), type=int,
help="Number of processors that you want this script to run on. The default is every CPU that the system has.")
parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
help="Suppresses most program text outputs.")
return parser.parse_args()
def check_options(parsed_args):
if os.path.isdir(parsed_args.infolder):
infolder = parsed_args.infolder
else:
print "The infolder directory %s does not exist." % parsed_args.infolder
sys.exit()
# if the directory that the user specifies does not exist, then the program makes it for them.
if not os.path.isdir(parsed_args.outfolder):
os.makedirs(parsed_args.outfolder)
outfolder = parsed_args.outfolder
if outfolder[-1] != '/':
outfolder = outfolder + '/'
if os.path.exists(parsed_args.gene_block_query):
gene_block_query = parsed_args.gene_block_query
else:
print "The gene block query file %s does not exist." % parsed_args.gene_block_query
sys.exit()
if os.path.exists(parsed_args.filter):
filter_file = parsed_args.filter
elif parsed_args.filter == '':
filter_file = parsed_args.filter
else:
print "The filter file %s does not exist." % parsed_args.filter
sys.exit()
# section of code that deals determining the number of CPU cores that will be used by the program
if parsed_args.num_proc > os.sysconf("SC_NPROCESSORS_CONF"):
num_proc = os.sysconf("SC_NPROCESSORS_CONF")
elif parsed_args.num_proc < 1:
num_proc = 1
else:
num_proc = int(parsed_args.num_proc)
quiet = parsed_args.quiet
return infolder, outfolder, gene_block_query, filter_file, num_proc, quiet
#this function will return all of the files that are in a directory. os.walk is recursive traversal.
def returnRecursiveDirFiles(root_dir):
result = []
for path, dir_name, flist in os.walk(root_dir):
for f in flist:
fname = os.path.join(path, f)
if os.path.isfile(fname):
result.append(fname)
return result
# The filter file here i think should be either a vacant value (such as '') or a user defined
# value. I do not think by default it should be given, like I have made the default behavior.
def parallel_blast_parse_dict(in_folder, out_folder, num_proc, filter_file, gene_block_dict):
result = {}
gene_block_out_folder = out_folder
if not os.path.isdir(gene_block_out_folder):
os.makedirs(gene_block_out_folder)
if filter_file != '':
tmp = returnRecursiveDirFiles(in_folder)
nc_list = [i.strip() for i in open(filter_file).readlines()]
fname_list = [i for i in tmp if i.split('/')[-1].split('.')[0] in nc_list]
else:
fname_list = returnRecursiveDirFiles(in_folder)
# this is working correctly...
#print "blast_parse fname_list", fname_list
for fname in fname_list:
for line in [i.strip() for i in open(fname).readlines()]:
try:
hlog = Homolog.from_blast(line)
except:
print "Error in function parallel_blast_parse_dict from script blast_parse.py, conversion from result to Homolog class failed.", line
#hlog.Print()
# this might have to be changed....
#accession = hlog.accession()
try:
accession = hlog.accession()
except:
print "There was an error in the Homolog class, found in line function parallel_blast_parse_dict from script blast_parse.py"
#predicted_gene = hlog.blast_annatation()
predicted_gene = hlog.blast_annotation()
'''
try: # faster implementation than "if predicted_gene in gene_block_dict.keys():"
gene_block = gene_block_dict[predicted_gene]
# Debugging the missing gene block casABCDE12... no idea right now.
if gene_block == 'casABCDE12':
print 'AFDFAFDSF'
if gene_block in result.keys(): # check if the gene block is in the result dict already, if not make a new entry in the else clause
if accession in result[gene_block].keys(): # Check if the organism has been added to the gene block
result[gene_block][accession].append(hlog.ret_str())
else: # if the organim has not been added to the rest yet, add it
result[gene_block].update({accession:[hlog.ret_str()]})
else: # add the gene block to the result
result.update({gene_block: {accession: [hlog.ret_str()]}})
except:
pass
'''
try: # faster implementation than "if predicted_gene in gene_block_dict.keys():"
gene_block = gene_block_dict[predicted_gene]
# Debugging the missing gene block casABCDE12... no idea right now.
# ok, it's there, so omitting this, leaving the comment in for the time being though
#if gene_block == 'casABCDE12':
# print 'AFDFAFDSF'
if gene_block in result.keys(): # check if the gene block is in the result dict already, if not make a new entry in the else clause
if accession in result[gene_block].keys(): # Check if the organism has been added to the gene block
result[gene_block][accession].append(hlog.to_file())
else: # if the organims is not part of the gene block, add it
result[gene_block].update({accession:[hlog.to_file()]})
else: # add the gene_block to the result
result.update({gene_block: {accession: [hlog.to_file()]}})
except:
pass
#print sorted(result.keys()), len(result)
'''
# For the time being, i am going to cause each intermediate step in this pipeline to save in a folder called intermediate_for_debug
intermediate_folder = './intermediate_for_debug/'
if not os.path.isdir(intermediate_folder):
os.makedirs(intermediate_folder)
# For this step i will save the result in 'unfiltered_gene_block/'
unfilter_folder = 'unfiltered_gene_block/'
if not os.path.isdir(intermediate_folder + unfilter_folder):
os.makedirs(intermediate_folder + unfilter_folder)
'''
for gene_block in result.keys():
# this code is omitted because it used to debugging purposes, and is currently unneeded
'''
outfile = intermediate_folder + unfilter_folder + gene_block + '.txt'
#print "outfile", outfile
handle = open(outfile, 'w')
for accession in result[gene_block].keys():
handle.write('\n'.join(result[gene_block][accession]) + '\n')
handle.close()
'''
# save results where i actually want them to go:
#print "plast_parse.py outfile", out_folder + gene_block + '.txt'
handle = open(out_folder + gene_block + '.txt', 'w')
for accession in result[gene_block].keys():
handle.write('\n'.join(result[gene_block][accession]) + '\n')
handle.close()
# I have to figure out a better name for this function. The jist of what I am doing here is as follows:
# First, I will provide a file name that contains all of the hits for every organism that we are interested in.
# Then it sorts this homolog list first by organism, then by locus. (By the required input, the files already have
# been screened for both eval cutoff and gene block membership. The function will then return a dictionary for that
# gene block. The intention is that this data structure will then be used to find the best hit for the locus out of the
# many redundant hits, however this functionality will be handled another function that i have yet to write/test.
def return_gene_block_list(fname):
gene_block = fname.split('/')[-1].split('.')[0]
hlog_list = [Homolog.from_file(i.strip()) for i in open(fname).readlines()]
result_dict = {}
for hlog in hlog_list:
accession = hlog.accession()
locus = hlog.locus()
if accession not in result_dict.keys():
result_dict.update({accession: {}})
if locus not in result_dict[accession].keys():
result_dict[accession].update({locus: [hlog]})
else:
result_dict[accession][locus].append(hlog)
#print result_dict[accession]
return result_dict
# currently not used
'''
# might not use this: will see
def parallel_return_gene_block_list(infolder, outfolder, num_proc):
pool = Pool(processes = num_proc)
organism_dict_for_recovery = dict(pool.map(parallel_gene_block_fasta, genome_of_interest_list))
'''
# This function will take the organism-locus dict (per gene block file) and determine the best homolog.
def best_homolog_list(gene_block_dict, outfile):
result = []
for org in sorted(gene_block_dict.keys()):
for locus in sorted(gene_block_dict[org].keys()):
hlog_list = gene_block_dict[org][locus][1:]
best_hit = gene_block_dict[org][locus][0]
gene_count = defaultdict(int) # i am goign to use this, to see if a locus has more than one predicted gene, and the count ratio
gene_count[best_hit.predicted_gene()] +=1
for hlog in hlog_list:
gene_count[hlog.predicted_gene()] +=1
if best_hit.e_val() > hlog.e_val():
best_hit = hlog
#print gene_count.keys()
result.append(best_hit)
handle = open(outfile, 'w')
handle.write('\n'.join([i.ret_str() for i in result]))
handle.close()
def return_gene_to_gene_block_dict(fname):
gene_block_dict = {}
for line in [i.strip().split('\t') for i in open(fname).readlines()]:
gene_block = line[0]
for gene in line[1:]:
gene_block_dict.update({gene: gene_block})
return gene_block_dict
def main():
start = time.time()
parsed_args = parser_code()
infolder, outfolder, gene_block_query, filter_file, num_proc, quiet = check_options(parsed_args)
if not quiet:
print infolder, outfolder, gene_block_query, filter_file, num_proc, quiet
# This code makes a dictionary mapping gene annotation to the gene block that it belong to
gene_block_dict = return_gene_to_gene_block_dict(gene_block_query)
#print "gene_block_dict", gene_block_dict
#parallel_blast_parse_dict('./blast_parse/organism_raw_info/', './blast_parse/filtered_homologs/', num_proc, './genbank_pathway_lists/nc_filter_file.txt', gene_block_dict)
parallel_blast_parse_dict(infolder, outfolder, num_proc, filter_file, gene_block_dict)
#gene_block_dict = return_gene_block_list('./blast_parse/filtered_homologs/atpIBEFHAGDC.txt')
#best_homolog_list(gene_block_dict, './blast_parse/processed_gene_block_files/atpIBEFHAGDC.txt')
if not quiet:
print time.time() - start
# ./blast_parse.py -f phylo_order.txt
if __name__ == '__main__':
main()