-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthoquoll.py
executable file
·234 lines (193 loc) · 7.67 KB
/
orthoquoll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python3
# Author: Felix Thalen
"Extract statistics from a directory of alignments."
from multiprocessing import Pool
from multiprocessing import cpu_count
import argparse
import sys
import os
import statistics
import fasta
import fasttree
import mafft
import newick
import supermatrix
import time
import pathlib
FASTA_EXTENSIONS = {".fa", ".fas", ".fasta", ".fna", ".faa", ".fsa", ".ffn",
".frn"}
NEWICK_EXTENSIONS = {".nw", ".newick", ".tre", ".tree", ".treefile"}
ALL_EXTENSIONS = FASTA_EXTENSIONS | NEWICK_EXTENSIONS
HEADER = "id;alignments;sequences;otus;meanSequences;meanOtus;meanSeqLen;\
shortestSeq;longestSeq;pctMissingData;catAlignmentLen;minTreeDiameter;\
maxTreeDiameter;meanTreeDiameter;medianTreeDiameter\n"
VERSION = "0.1.0"
RESET = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
def parse_args():
'Parse the user-provided arguments.'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('alignments',
metavar='PATH',
nargs='+',
type=str,
help='path to a directory that contains multiple FASTA \
files or one or more FASTA files')
parser.add_argument('--id',
metavar='STRING',
type=str,
default='unknown',
help='give this supermatrix a custom name (default: \
unknown)')
parser.add_argument('--realign',
action='store_true',
default=False,
help='realign all alignments using MAFFT\'s LINSI \
algorithm')
parser.add_argument('--subdirs',
action='store_true',
default=False,
help='search for files in subdirectories')
parser.add_argument('--no-trees',
action='store_true',
default=False,
help='do not infer phylogenetic trees and do not \
report tree diameter statistics (much faster)')
parser.add_argument('--no-header',
action='store_true',
default=False,
help='if provided, do not include a header in the \
CSV output')
parser.add_argument('--threads',
metavar='COUNT',
default=cpu_count(),
help='number of threads used for running MAFFT and \
FastTree (default: all available')
parser.add_argument('--overwrite',
action='store_true',
default=False,
help='if provided, overwrite any existing file with \
the same output path')
parser.add_argument('--output',
metavar='PATH',
help='path to the output file, pre-existing files are \
overwritten! (default: supermatrix_stats.csv)',
default='supermatrix_stats.csv')
return parser.parse_args(args=None if sys.argv[1:] else ['--help'])
def is_fasta(path):
"Returns True if the provided path is the path to a FASTA file"
filename, extension = os.path.splitext(path)
filename = filename.split(".")[0]
extension = extension.lower()
if extension in FASTA_EXTENSIONS:
return True
return False
def is_newick(path):
"Returns True if the provided path is the path to a Newick file."
filename, extension = os.path.splitext(path)
filename = filename.split(".")[0]
extension = extension.lower()
if extension in NEWICK_EXTENSIONS:
return True
return False
def fasta_files(directory, extensions=FASTA_EXTENSIONS, subdirs=False):
"""
Returns a list of all of the FASTA files within the provided directory.
Searches for files within all of the directories' subdirectorie(s) if the
subdirs argument is provided. Produces an error message and exits with
an exit code of 1, in case no files were found.
"""
files = []
file_pairs = dict()
rootdir = pathlib.Path(directory)
directories = [rootdir]
if subdirs:
directories += [path for path in rootdir.iterdir() if path.is_dir()]
for subdirectory in directories:
files += [path for path in subdirectory.iterdir() if path.suffix in extensions]
if len(files) < 1:
print('no FASTA file pairs found in the provided path')
sys.exit()
return files
def tree_stats(tree_files):
"""
Takes a list of tree files as an input and returns a dictionary containing
the following tree statistics: max, min, mean, and median.
"""
diameters = []
for tree_file in tree_files:
node = newick.read(tree_file)
diameters.append(node.tree_diameter())
stats = {
"min": min(diameters),
"max": max(diameters),
"mean": statistics.mean(diameters),
"median": statistics.median(diameters)
}
return stats
def main():
"""
Generate statistics from multiple sequence alignments (MSAs) in the
provided directory.
"""
START_TIME = time.time()
args = parse_args()
print("{}OrthoQuoll{} {}".format(BOLD, RESET, VERSION))
print("author: Felix Thalen <[email protected]>\n")
alignments = []
for alignment in args.alignments:
alignment_path = pathlib.Path(alignment)
if alignment_path.is_file() and alignment_path.suffix in FASTA_EXTENSIONS:
alignments.append(alignment)
elif alignment_path.is_dir():
alignments += fasta_files(alignment, FASTA_EXTENSIONS, args.subdirs)
msas = []
for file in alignments:
# cat_msas.msas.append(fasta.read(file))
msas.append(fasta.read(str(file)))
# Realign everything if '--realign' is set
if args.realign:
print("Realigning provided alignments using MAFFT (L-INS-i algorithm)")
realigned_files = []
with Pool(processes=int(args.threads)) as pool:
for alignment in pool.imap_unordered(mafft.run_mafft, alignments):
realigned_files.append(alignment)
alignments = realigned_files
if args.overwrite and os.path.exists(args.output):
os.remove(args.output)
# Generate tree diameter statistics (if not unset)
if not args.no_trees:
# Generate trees from all of the alignments
print("Generating trees using FastTree")
trees = []
with Pool(processes=int(args.threads)) as pool:
for tree in pool.imap_unordered(fasttree.run_fasttree, alignments):
trees.append(tree)
if not args.no_header:
with open(args.output, 'a') as msas_stats_file:
msas_stats_file.write(HEADER)
tree_diameter_stats = tree_stats(trees)
else:
print("Skipping tree inference and tree diameter statistics")
tree_diameter_stats = {
"min": 0.0,
"max": 0.0,
"mean": 0.0,
"median": 0.0
}
cat_msas = supermatrix.Supermatrix()
for file in alignments:
cat_msas.msas.append(fasta.read(file))
cat_msas.report(args.id, args.output, tree_diameter_stats)
# cleanup temporary files
if args.realign and not args.no_trees:
for path in realigned_files + trees:
os.remove(path)
elif not args.no_trees:
for path in trees:
os.remove(path)
print("\ncompleted in {} seconds".format(
round(time.time() - START_TIME, 2)))
if __name__ == '__main__':
main()