-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign.py
78 lines (57 loc) · 1.87 KB
/
align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from utils import Bunch
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
from Bio.Seq import Seq
def align(seq1, seq2):
matrix = matlist.pam60
gap_open = -5
gap_extend = -1
alns = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
top_aln = alns[0]
aln_seq1, aln_seq2, score, begin, end = top_aln
return aln_seq1, aln_seq2
def get_triplet(seq, pos, protein):
if protein != '-':
triplet_nucl = seq[pos : pos + 3]
pos += 3
else:
triplet_nucl = '---'
return triplet_nucl
def protein_align(seq1, seq2):
aseq1 = aseq2 = ''
apseq1, apseq2 = align(
seq1.aseq[:-1],
seq2.aseq[:-1]
)
mismatch = pos1 = pos2 = 0
mutations = {}
for n in range(len(apseq1)):
triplet_nucl1 = get_triplet(seq1.seq, pos1, apseq1[n])
pos1 += 3 if triplet_nucl1 != '---' else 0
aseq1 += triplet_nucl1
triplet_nucl2 = get_triplet(seq2.seq, pos2, apseq2[n])
pos2 += 3 if triplet_nucl2 != '---' else 0
aseq2 += triplet_nucl2
each_mismatch = 0
for i in range(3):
if(triplet_nucl1[i] != triplet_nucl2[i]):
if(apseq1[n] != apseq2[n]):
mutations[3 * n + i] = 'protein'
else:
mutations[3 * n + i] = 'nucleotide'
each_mismatch += 1 if triplet_nucl1[i] != triplet_nucl2[i] else 0
if apseq1[n] != apseq2[n]:
each_mismatch *= 2
mismatch += each_mismatch
return Bunch(score=mismatch, mutations=mutations, aseq1=aseq1, aseq2=aseq2, aaseq1=apseq1, aaseq2=apseq2)
def pairwise_align(sequences_by_species):
sequences = []
for key, value in sequences_by_species.items():
sequences += value
sequences_count = len(sequences)
alignments = [[ None for i in range(sequences_count)] for j in range(sequences_count)]
for i in range(sequences_count):
print(sequences[i].specie)
for j in range(sequences_count):
alignments[i][j] = protein_align(sequences[i], sequences[j])
return Bunch(sequences=sequences, alignments=alignments)