-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTaxonomyAlign.py
526 lines (422 loc) · 23.1 KB
/
TaxonomyAlign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
import argparse
import logging
import time
import shutil
import os
import subprocess as sp
from Bio import SeqIO
from Bio.Seq import Seq
def get_args():
"""
Get arguments from command line.
"""
parser = argparse.ArgumentParser(
description="""TaxonomyAlign: Use to perform multiple sequence alignment for a highly variable gene
and a large number of divergent sequences. This program automatically divides the sequences
into smaller subsets based on user-supplied taxonomic groupings, and builds sub-alignments for the groups using MAFFT
(several algorithm options available). The defined groupings should be related to both taxonomy and phylogeny (e.g., genus,
family, superfamily, etc). The resulting sub-alignments are then merged together with MAFFT, with or without an optional
polishing step. The original use-case was for creating high-quality alignments from many 12S and 16S mtDNA sequences
across a large phylogenetic scale. However, it should prove useful for other genes with many species and divergent sequences
to align.""")
parser.add_argument("-f", "--fasta",
required=True,
help="REQUIRED: The full path to the input fasta file of unaligned sequences.")
parser.add_argument("-m", "--map",
required=True,
help="REQUIRED: The full path to a map file containing sequence names "
"and corresponding groupings/taxonomy.")
parser.add_argument("-o", "--outdir",
required=True,
help="REQUIRED: The full path to an existing directory "
"to write output files.")
parser.add_argument("-a", "--algorithm",
required=False,
default="auto",
choices=["auto", "FFT-NS-i", "E-INS-i", "L-INS-i", "G-INS-i"],
help="OPTIONAL: The MAFFT algorithm to use: 1) auto-select based on input sequences, "
"2) FFT-NS-i is a progressive and general algorithm, 3) E-INS-i is best for several "
"conserved core regions and variable regions (e.g., 12S and 16S mtDNA), "
"4) L-INS-i is best for a single conserved core region with variable flanking regions, and 5) "
"G-INS-i is best for full-length single core sequences. Defaults to auto.")
parser.add_argument("-t", "--threads",
default=None,
help="OPTIONAL: Specifies number of threads to use.")
parser.add_argument("--polish",
action='store_true',
help="OPTIONAL: Performs additional distance measure refinement on final merged alignment, "
"at cost of increasing run time (sometimes substantially).")
parser.add_argument("--output_seqnames",
action='store_true',
help="OPTIONAL: Write sorted names of all sequences in fasta file to stdout and quit. "
"Must still provide -f, -m, and -o required flags above (though value only matters for -f).")
parser.add_argument("--verbose",
action='store_true',
help="OPTIONAL: Show mafft progress during final merge step instead of redirecting to log file. "
"May be useful for monitoring the --polish option, which can be slow for larger datasets.")
return parser.parse_args()
def setup_logging(outdir):
"""
Set up logging to log file and on-screen.
Args:
outdir - full path to existing directory to write outputs
"""
# set up logging to file
logging.basicConfig(filename=os.path.join(outdir, "Taxonomy-Align_{}.log".format(time.strftime("%b-%d-%Y_%I.%M.%S"))),
format="%(levelname)s: %(asctime)s: %(message)s",
datefmt='%d-%b-%y %H:%M:%S',
level=logging.DEBUG)
# set up logging to console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s",
datefmt='%H:%M:%S')
console.setFormatter(formatter)
logging.getLogger().addHandler(console)
def make_output_paths(outdir):
"""
Function to create output directories in directory specified by user.
Args:
outdir - full path to existing directory to write outputs
Returns:
dir_groups - full path to directory 1-Group-Fasta-Files
dir_alns - full path to directory 2-Group-Alignments
dir_merged - full path to directory 3-Merged-Alignments
"""
os.chdir(outdir)
curpath = os.getcwd()
dir_groups = os.path.join(curpath, "1-Group-Fasta-Files")
dir_alns = os.path.join(curpath, "2-Group-Alignments")
dir_merged = os.path.join(curpath, "3-Merged-Alignments")
for path in [dir_groups, dir_alns, dir_merged]:
if not os.path.exists(path):
os.mkdir(path)
return dir_groups, dir_alns, dir_merged
def fasta_to_dict(fasta_file):
"""
Function to convert any fasta file (f) into
a dictionary structure with taxon as key and
sequence as value.
Args:
fasta_file - full path to the input fasta file of unaligned sequences
Returns:
fasta_dict - dictionary of seq names as keys and seqs as values
"""
logging.info("Parsing fasta file.")
fasta_dict = {}
with open(fasta_file, 'r') as fh:
lines = [l.strip() for l in fh if l.strip()]
for line in lines:
if line.startswith(">"):
new_key = line.replace(">",'')
fasta_dict[new_key] = ""
else:
fasta_dict[new_key] += line.upper()
logging.info("Found {:,} sequences in fasta file.\n\n".format(len(fasta_dict)))
return fasta_dict
def map_to_dict(map_file):
"""
Processes map file to obtain a dictionary of group names as keys and
a list of taxa as the values. Returns this dictionary.
The map file should be tab-delimited and contain exactly two columns. Column one
is the name of a sequence (ideally in format like 'Hyperolius_guttulatus') and
column two is the group assignment for that sequence ('Hyperoliidae'). It is
expected that multiple sequences will be assigned to the same group, and that there
must be at least two groups in order to run this program.
Args:
map_file - full path to the input map file of seq names and group names
Returns:
map_dict - dictionary of group names as keys and list of seq names as values
"""
logging.info("Parsing map file.")
map_dict = {}
with open(map_file, 'r') as fh:
for line in fh:
if line.strip() and len(line.split()) == 2:
if line.strip().split()[1] not in map_dict:
map_dict[line.strip().split()[1]] = [line.strip().split()[0]]
else:
map_dict[line.strip().split()[1]].append(line.strip().split()[0])
logging.info("Found {:,} groups in map file.\n\n".format(len(map_dict)))
return map_dict
def check_map_fasta_compatibility(fasta_dict, map_dict):
"""
Perform a quick check to see which sequences in the fasta file
can be matched to those provided in the mapping file. Reports all
unmatched sequences in the fasta file.
Args:
fasta_dict - dictionary of seq names as keys and seqs as values (produced by fasta_to_dict())
map_dict - dictionary of group names as keys and list of seq names as values (produced by map_to_dict())
"""
fasta_taxa = set(list(fasta_dict.keys()))
map_taxa = set()
for k, v in map_dict.items():
for i in v:
map_taxa.add(i)
unique_fasta_taxa = fasta_taxa - map_taxa
if len(unique_fasta_taxa) == 0:
logging.info("All sequences in fasta file have an assigned grouping.\n\n")
else:
logging.info("The following {:,} sequences are not assigned to a group:\n{}".format(len(unique_fasta_taxa), "\n\t".join(sorted(unique_fasta_taxa))))
logging.info("They will be excluded from the analysis.\n\n")
def write_group_fastas(fasta_dict, map_dict, dir_groups):
"""
Write group-specific fasta files to output directory.
Args:
fasta_dict - dictionary of seq names as keys and seqs as values (produced by fasta_to_dict())
map_dict - dictionary of group names as keys and list of seq names as values (produced by map_to_dict())
dir_groups - full path to full path to directory 1-Group-Fasta-Files
"""
logging.info("Writing fasta files for groups:")
for group in sorted(map_dict.keys()):
# get intersection (common vals) of the taxon sets
matched_taxa = set(map_dict[group]) & set(list(fasta_dict.keys()))
# if one sequence present for group, write singleton fasta
if len(matched_taxa) == 1:
# remove any existing version of this group fasta file
out_fasta = os.path.join(dir_groups, "{}.singleton.fasta".format(group))
if os.path.exists(out_fasta):
logging.warning("Removing prior existing file: {}".format(out_fasta))
os.remove(out_fasta)
with open(out_fasta, 'a') as fh:
for taxon in sorted(matched_taxa):
fh.write(">{}\n{}\n".format(taxon, fasta_dict[taxon]))
logging.info("\tGroup {} is represented by {} fasta sequences.".format(group, len(matched_taxa)))
# if 2 or more sequences present for group, write multiples fasta
elif len(matched_taxa) > 1:
# remove any existing version of this group fasta file
out_fasta = os.path.join(dir_groups, "{}.multiples.fasta".format(group))
if os.path.exists(out_fasta):
logging.warning("Removing prior existing file: {}".format(out_fasta))
os.remove(out_fasta)
with open(out_fasta, 'a') as fh:
for taxon in sorted(matched_taxa):
fh.write(">{}\n{}\n".format(taxon, fasta_dict[taxon]))
logging.info("\tGroup {} is represented by {} fasta sequences.".format(group, len(matched_taxa)))
# if no sequences present for group, skip fasta writing
else:
logging.info("\tGroup {} is NOT represented in fasta sequences, skipping.".format(group, len(matched_taxa)))
logging.info("Finished writing group fasta files.\n\n")
def reformat_mafft_output_fasta(temp_fasta, out_fasta):
"""
Checks the alignment output file of mafft to ensure worked, and if so
will cleanup the format and write to the final alignment file.
Args:
temp_fasta - the full path to the alignment file produced directly by mafft
out_fasta - the full path to the final alignment file to write after cleaning
"""
if not os.stat(temp_fasta).st_size != 0:
logging.info("ERROR: Alignment failed. Please check log file for details.\n\n")
else:
if os.path.exists(out_fasta):
logging.warning("Removing prior existing file: {}".format(out_fasta))
os.remove(out_fasta)
fdict = SeqIO.index(temp_fasta, "fasta")
with open(out_fasta, 'a') as fh:
for record in fdict:
newseq = fdict[record].seq.upper()
fh.write( ">{}\n{}\n".format(fdict[record].description, newseq))
os.remove(temp_fasta)
logging.info("Finished alignment!\n\n")
def align_individual_group_fastas(dir_groups, dir_alns, threads, algorithm):
"""
Runs mafft with the specified algorithm for all group fasta files containing
at least two sequences. Will clean the outputs from mafft to write a nicely
formatted output alignment fasta file.
Args:
dir_groups - full path to directory 1-Group-Fasta-Files
dir_alns - full path to directory 2-Group-Alignments
threads - optional number of threads to use for mafft
algorithm - user choice for alignment algorithm to run in mafft
"""
# show information about singleton groups, if they exist
singleton_fasta_list = [os.path.join(dir_groups, f) for f in os.listdir(dir_groups) if f.endswith(".singleton.fasta")]
if singleton_fasta_list:
logging.info("There are {:,} group fasta files containing a single sequence:".format(len(singleton_fasta_list)))
for f in singleton_fasta_list:
logging.info("\t{}".format(f.split('/')[-1]))
logging.info("These will be individually aligned to the merged group alignment.\n\n".format(len(singleton_fasta_list)))
# show information about multiples groups
multiples_fasta_list = sorted([os.path.join(dir_groups, f) for f in os.listdir(dir_groups) if f.endswith(".multiples.fasta")])
logging.info("Found {:,} group fasta files suitable for alignment.\n\n".format(len(multiples_fasta_list)))
# if all groups are single sequence, kill the program
if not multiples_fasta_list:
message = "ERROR: No group fasta files contain >1 sequence!"
logging.info("{}".format(message))
raise ValueError(message)
if threads:
thread_str = "--thread {}".format(threads)
else:
thread_str = ""
algo_dict = {"FFT-NS-i":"--maxiterate 1000",
"E-INS-i":"--genafpair --maxiterate 1000",
"L-INS-i":"--localpair --maxiterate 1000",
"G-INS-i":"--globalpair --maxiterate 1000",
"auto":"--auto"}
# iterate over group fasta files to perform individual alignments
for input_fasta in multiples_fasta_list:
name = input_fasta.split('/')[-1].split(".")[0]
logging.info("Beginning alignment of {} using {} algorithm.".format(name, algorithm))
# set names for all outputs
temp_fasta = os.path.join(dir_alns, "{}.mafft.temp.fasta".format(name))
if os.path.exists(temp_fasta):
logging.warning("Removing prior existing file: {}".format(temp_fasta))
os.remove(temp_fasta)
out_fasta = os.path.join(dir_alns, "{}.mafft-{}.fasta".format(name, algorithm))
log = os.path.join(dir_alns, "{}.mafft-{}.log".format(name, algorithm))
# generate mafft command and execute
mafft_call = "mafft {0} {1} {2} > {3} 2> {4}".format(thread_str, algo_dict[algorithm], input_fasta, temp_fasta, log)
logging.info("{}".format(mafft_call))
sp.call(mafft_call, shell=True)
# reformat output file if alignment successful - will also notify if alignment failed
reformat_mafft_output_fasta(temp_fasta, out_fasta)
logging.info("Finished all group-specific alignments.\n\n")
def prepare_subalignments_for_merge(dir_groups, dir_alns, dir_merged, algorithm):
"""
This function prepares a concatenated sub-alignments fasta file and a
sequence assignment table. These are mafft-specific input files required
for the --merge algorithm.
The sub-alignments are simply concatenated in fasta format, but the numerical sequence order
is used to specify subalignment sequence sets in the table file.
Example table:
1 2 3 4 5 # this is comment. subMSA1
6 7 8 # you can write anything after #
9 10 # subMSA3
In the above, sequences 1-5 are aligned, 6-8 are aligned, 9-10 are aligned, and any other additional sequences in
the concatenated sub-alignments fasta file (e.g., 10, 11, 12, etc.) are considered unaligned. Those unaligned
sequences will get aligned to the merged profile alignment during this final step.
The output table produced here will have the group name commented after the sequence numbers:
1 2 3 4 5 # Group = Hyperoliidae
Args:
dir_groups - full path to directory 1-Group-Fasta-Files
dir_alns - full path to directory 2-Group-Alignments
dir_merged - full path to directory 3-Merged-Alignments
algorithm - user choice for alignment algorithm to run in mafft
Returns:
table - full path to the sequence assignment table
cataln - full path to the concatenated sub-alignments fasta file
"""
logging.info("Preparing sub-alignment file and sequence table.\n\n")
# start sequence counter
seq_number = int(1)
# create table filename
table = os.path.join(dir_merged, "Merge-Table.txt")
if os.path.exists(table):
logging.warning("Removing prior existing file: {}".format(table))
os.remove(table)
# create concatenated msa filename
cataln = os.path.join(dir_merged, "Sub-Alignments.fasta")
if os.path.exists(cataln):
logging.warning("Removing prior existing file: {}".format(cataln))
os.remove(cataln)
# iterate over aligned fasta files and write required input files
aligned_fasta_list = sorted([os.path.join(dir_alns, f) for f in os.listdir(dir_alns) if f.endswith(".mafft-{}.fasta".format(algorithm))])
if not aligned_fasta_list:
message = "ERROR: No aligned fasta files found!"
logging.info("{}".format(message))
raise ValueError(message)
else:
logging.info("Adding sub-alignments:")
for aln_file in aligned_fasta_list:
temp_seq_numbers = []
with open(cataln, 'a') as fhout, open(aln_file, 'r') as fhin:
for line in fhin:
if line.startswith('>'):
temp_seq_numbers.append(str(seq_number))
seq_number += 1
fhout.write(line)
with open(table, 'a') as fh:
fh.write("{} # Group = {}\n".format(" ".join(temp_seq_numbers), aln_file.split('/')[-1].split('.')[0]))
logging.info("\tsub-alignment {}: wrote {:,} aligned sequences ({:,} - {:,})".format(aln_file.split('/')[-1],
len(temp_seq_numbers),
int(temp_seq_numbers[0]), int(temp_seq_numbers[-1])))
logging.info("Finished adding sub-alignments.\n\n")
# add singleton sequences to end of fasta file, no numbers added to table because these are unaligned
singleton_fasta_list = [os.path.join(dir_groups, f) for f in os.listdir(dir_groups) if f.endswith(".singleton.fasta")]
if singleton_fasta_list:
logging.info("Writing singleton group sequences to sub-alignment file:")
for fasta_file in singleton_fasta_list:
with open(cataln, 'a') as fhout, open(fasta_file, 'r') as fhin:
for line in fhin:
if line.startswith('>'):
logging.info("\tAdding single sequence from {}: {}".format(fasta_file.split('/')[-1].split('.')[0], line.strip().strip('>')))
fhout.write(line)
logging.info("Finished adding singleton group sequences.\n\n")
else:
logging.info("No singleton group sequences to add to sub-alignment file.\n\n")
logging.info("Finished preparing sub-alignment file and sequence table.\n\n")
return table, cataln
def run_merge(table, cataln, dir_merged, threads, polish, verbose):
"""
Performs the merge algorithm in mafft based on the sequence assignment table
and concatenated sub-alignments fasta file produced.
Options for running mafft merge algorithm:
1) For basic merge run:
mafft --merge table input > output
2) A more rigorous distance measure can be used for small data:
mafft --localpair --merge table input > output
3) Iterative refinement is supported in version 7.040 and higher:
mafft --localpair --maxiterate 10 --merge table input > output
The absence of --polish invokes option 1 above, the presence of --polish
will invoke option 3.
Args:
table - full path to the sequence assignment table
cataln - full path to the concatenated sub-alignments fasta file
dir_merged - full path to directory 3-Merged-Alignments
algorithm - user choice for alignment algorithm to run in mafft
threads - optional number of threads to use for mafft
polish - a true/false toggle for the distance measure + interative refinement step after merge
"""
logging.info("Starting the alignment merging step.")
# set names for all outputs
temp_fasta = os.path.join(dir_merged, "temp.fasta")
out_fasta = os.path.join(dir_merged, "Merged-alignment.fasta")
log = os.path.join(dir_merged, "Merged-alignment.log")
if threads:
thread_str = "--thread {} ".format(threads)
else:
thread_str = ""
if polish is True:
polish_str = "--localpair --maxiterate 10"
else:
polish_str = ""
if verbose is True:
err_str = ""
else:
err_str = "2> {}".format(log)
# generate mafft command and execute
mafft_call = "mafft {0} {1} --merge {2} {3} > {4} {5}".format(thread_str, polish_str, table, cataln, temp_fasta, err_str)
logging.info("\t{}".format(mafft_call))
sp.call(mafft_call, shell=True)
# reformat output file if alignment successful - will also notify if alignment failed
reformat_mafft_output_fasta(temp_fasta, out_fasta)
logging.info("Process complete!")
logging.info("Please inspect Merged-alignment.fasta file available in: {}.\n\n".format(dir_merged))
def main():
args = get_args()
if args.output_seqnames is True:
# turn input fasta file into dictionary structure (keys = seq names, vals = seqs)
fasta_dict = fasta_to_dict(args.fasta)
for name in sorted(fasta_dict.keys()):
print(name)
else:
# setup log file and on-screen settings
setup_logging(args.outdir)
# create output directories and return their paths
dir_groups, dir_alns, dir_merged = make_output_paths(args.outdir)
# turn input fasta file into dictionary structure (keys = seq names, vals = seqs)
fasta_dict = fasta_to_dict(args.fasta)
# turn mapping file into dictionary structure (keys = group names, vals = lists of seq names)
map_dict = map_to_dict(args.map)
# check whether all sequences in fasta file can be assigned to groups
check_map_fasta_compatibility(fasta_dict, map_dict)
# write the group-specific fasta files
write_group_fastas(fasta_dict, map_dict, dir_groups)
# align each group-specific fasta file, e.g. prepare sub-alignments
align_individual_group_fastas(dir_groups, dir_alns, args.threads, args.algorithm)
# prepare two required input files for mafft --merge based on sub-alignments produced, return paths to those input files
table, cataln = prepare_subalignments_for_merge(dir_groups, dir_alns, dir_merged, args.algorithm)
# run final merge algorithm in mafft
run_merge(table, cataln, dir_merged, args.threads, args.polish, args.verbose)
if __name__ == '__main__':
main()