-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta.py
48 lines (44 loc) · 1.69 KB
/
fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
Module for working with the FASTA file format.
"""
from __future__ import absolute_import
from os import path
from textwrap import wrap
import msa
def read(filename):
    """
    Parse a FASTA file into a multiple sequence alignment.

    Creates an ``msa.MultipleSequenceAlignment`` from ``filename`` and its
    file extension, then adds one sequence per FASTA record by calling
    ``add_sequence(None, description, sequence_data)``. The description is
    the header line without its leading ">"; the sequence data is every
    following line up to the next header, with trailing whitespace removed
    and the lines joined together.

    Records whose sequence data is empty are skipped. Text that appears
    before the first header is added with an empty description.

    Returns the populated alignment object.
    """
    extension = path.splitext(filename)[1]
    alignment = msa.MultipleSequenceAlignment(filename, extension)
    description = ""
    # Sequence lines are collected in a list and joined once per record;
    # growing a string with "+=" line by line can be quadratic for long
    # sequences.
    chunks = []
    with open(filename) as fasta_file:
        for line in fasta_file:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                sequence_data = "".join(chunks)
                if sequence_data:
                    alignment.add_sequence(None, description, sequence_data)
                description = line[1:]
                chunks = []
            else:
                chunks.append(line)
    # The last record has no following header to close it.
    sequence_data = "".join(chunks)
    if sequence_data:
        alignment.add_sequence(None, description, sequence_data)
    return alignment
def write(alignment, max_column):
    """
    Write every sequence of an alignment to ``alignment.filename`` as FASTA.

    The file is opened in "w" mode, so existing content is replaced. Each
    entry of ``alignment.sequences`` is written as a ">" header line holding
    its ``description``, followed by its ``sequence_data``.

    If ``max_column`` is truthy, the sequence data is split into lines of
    exactly ``max_column`` characters (the final line may be shorter).
    If it is falsy (``0`` or ``None``), the data is written on one line.

    Raises ValueError if ``max_column`` is negative.
    """
    # Validate before opening so a bad width does not truncate the file.
    if max_column and max_column < 0:
        raise ValueError("invalid width {!r} (must be > 0)".format(max_column))
    with open(alignment.filename, "w") as fasta_file:
        for sequence in alignment.sequences:
            seq_data = sequence.sequence_data
            if max_column:
                # Slice at fixed offsets instead of using textwrap.wrap():
                # wrap() targets prose, so it breaks early at "-" gap
                # characters and collapses whitespace, which yields uneven
                # line lengths for aligned sequences.
                seq_data = "\n".join(
                    seq_data[start:start + max_column]
                    for start in range(0, len(seq_data), max_column)
                )
            fasta_file.write(">{}\n".format(sequence.description))
            fasta_file.write("{}\n".format(seq_data))