-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequence.py
131 lines (109 loc) · 3.82 KB
/
sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Amino acid or nucleotide sequence data type.
"""
import re
AMINO_ACIDS = "^[ABCDEFGHIKLMNPQRSTUVWYZX*-.]*$"
NUCLEOTIDES = "^[ATKMBVCNSWDGUYRH-.]*$"
IUPAC_CODES = (AMINO_ACIDS, NUCLEOTIDES)
# GAP_CHARACTERS = {"-", "?", "x"}
GAP_CHARACTERS = {"-"}
class Sequence(object):
"""
Represents a biological sequence. If no data type is provided, it will be
determined based on the file's extension or the sequence content.
"""
def __init__(self, description="", sequence_data=""):
self._description = str(description)
self._sequence_data = str(sequence_data)
self._is_alignment = True if self.is_alignment else False
if description:
self._otu = re.split(r"\||@", description)[0]
self._identifier = re.split(r"\||@", description)[1]
else:
self._otu = ""
self._identifier = ""
def __str__(self):
return self.description
def __len__(self):
return len(self.sequence_data)
def __nonzero__(self):
return True
def __bool__(self):
return True
@property
def description(self):
"A description or name of the sequence."
return self._description
@description.setter
def description(self, value):
self._description = value
@property
def sequence_data(self):
"""The raw sequence data."""
return self._sequence_data
@sequence_data.setter
def sequence_data(self, value):
self._sequence_data = value
@property
def otu(self):
"""
The operational taxonomical unit (OTU) to which this sequence belongs to.
"""
return self._otu
@otu.setter
def otu(self, value):
self._otu = value
@property
def identifier(self):
"""
A unique identifier for this sequence.
"""
return self._identifier
@identifier.setter
def identifier(self, value):
self._identifier = value
@property
def is_alignment(self):
"""
Returns True if the sequence contain at least one gap character. Only
dashes, '-' or dots '.' are considered to be gap characters.
"""
return bool("-" in self.sequence_data)
@is_alignment.setter
def is_alignment(self, value):
self._is_alignment = value
def _validate_sequence(self):
if not self.sequence_data:
return
for letter in self.sequence_data:
if letter not in IUPAC_CODES:
raise AssertionError('Non-IUPAC codes found in sequence \
{}.'.format(self.description))
def count(self, letter):
"""Returns the frequency of the provided letter in this sequence."""
return self._sequence_data.count(letter)
def gc_content(self):
"""
Return this sequence's GC content as a floating point number.
"""
seq_lower = self.sequence_data.lower()
nucleotides = 'acgt'
for base in seq_lower:
if not base in nucleotides:
print('Expected a nucleotide base (a, c, g, t), but found {} \
in sequence {}.'.format(base, self.description))
gc_count = seq_lower.count('g') + seq_lower.count('c')
return round(float(gc_count) / len(self.sequence_data), 2)
def ungapped(self):
"""
Return this sequence without any gap character. Gap characters that are
considered include dash ('-'), question mark ('?') and
uppercase/lowercase X.
"""
seq_ungapped = self.sequence_data.lower()
for character in GAP_CHARACTERS:
seq_ungapped = seq_ungapped.replace(character, "")
return seq_ungapped
def missing_data(self):
"Returns the numbers of gap characters within this sequence"
return len(self) - len(self.ungapped())