-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
122 lines (100 loc) · 3.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import sys, os
import argparse
import random
import matplotlib.pyplot as plt
from standard_needleman import needleman_wunsch
from utils import compute
from anchored_needleman import anchored_needleman_wunsch
import time
import numpy as np
def make_arg_parser():
parser = argparse.ArgumentParser(prog='main.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-q","--query",
default=argparse.SUPPRESS,
required=True,
help="Path to query fasta [required]")
parser.add_argument("-r","--reference",
default=argparse.SUPPRESS,
required=True,
help="Path to reference fasta [required]")
parser.add_argument("-m","--matches",
default=None,
help="Path to Matches file [optional]")
return parser
def read_sequence(filepath):
with open(filepath) as f:
s = f.readlines()
res = ''
for i in range(1,len(s)):
res += s[i].rstrip()
return res
def read_matches(filepath):
matches = []
with open(filepath) as f:
for line in f:
line_list = line.strip().split("\t")
line_list = [int(i) for i in line_list] # convert to int
matches.append(line_list)
return matches
# python main.py -q Human_HOX.fa -r Fly_HOX.fa -m Match_HOX.txt
if __name__ == '__main__':
parser = make_arg_parser()
args = parser.parse_args()
Q = read_sequence(filepath = args.query)
R = read_sequence(filepath = args.reference)
if args.matches == None:
matches_ = None
else:
matches_ = read_matches(filepath=args.matches)
gap_penalty = -2
match_score = 1
mismatch_score = -3
if matches_ == None:
# Standard Needleman-Wunsch
[score, query_with_gaps, ref_with_gaps] = needleman_wunsch(Q = Q, R = R,
gap_penalty = gap_penalty, match_score = match_score, mismatch_score = mismatch_score)
else:
# Anchored Needleman-Wunsch
[score, query_with_gaps, ref_with_gaps] = anchored_needleman_wunsch(Q = Q, R = R,
matches = matches_, gap_penalty = gap_penalty, match_score = match_score, mismatch_score = mismatch_score)
# print(compute(query_with_gaps,ref_with_gaps))
print('Score: ' + str(score) + '\n')
print('Aligned Query Sequence: ' + '\n' + query_with_gaps + '\n' )
print('Aligned Reference Sequence: ' + '\n' + ref_with_gaps + '\n')
# Random sequences 10,000 times
scores_random_sequences = []
t1 = time.clock()
idx = 1
Q_ = list(Q)
R_ = list(R)
for i in range(10000):
print(idx)
idx+=1
Q_ = np.random.permutation(Q_)
R_ = np.random.permutation(R_)
# random.shuffle(Q_)
# random.shuffle(R_)
Q_rand = "".join(Q_)
R_rand = "".join(R_)
if matches_ == None:
# Standard Needleman-Wunsch
[curr_score,_,_] = needleman_wunsch(Q = Q_rand, R = R_rand,
gap_penalty = gap_penalty, match_score = match_score, mismatch_score = mismatch_score)
else:
# Anchored Needleman-Wunsch
[curr_score,_,_] = anchored_needleman_wunsch(Q = Q_rand, R = R_rand,
matches = matches_, gap_penalty = gap_penalty, match_score = match_score, mismatch_score = mismatch_score)
scores_random_sequences.append(curr_score)
t2 = time.clock()
print(t2 - t1)
# print(scores_random_sequences)
# print(len(scores_random_sequences))
# Plotting
n, bins, patches = plt.hist(x= scores_random_sequences, bins='auto',
color='#52D017', alpha=0.7, rwidth=0.85) # Yellow Green
plt.axvline(x = score, linewidth = 3, color = 'r')
plt.title('Alignment Scores for 10000 Random Sequences')
plt.xlabel('Alignment Score')
plt.ylabel('Frequency')
curr_plot_name = 'Plot_HOX.png'
plt.savefig(curr_plot_name)