-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRandomizedMotif.py
146 lines (129 loc) · 5.87 KB
/
RandomizedMotif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# a Monte Carlo algorithm
# this is not guaranteed to return exact solutions, but they do quickly find approximate solutions
import random
def randomMotifs(dna,k,t):
kmm = []
sc = []
k = 3
D = {}
for i in range(0,len(dna)):
km = []
for kk in range(len(dna[i])-k+1):
km += [dna[i][kk:kk+k]]
D[i] = km
for m in range(0,t):
ran = random.randint(0,len(D[0])-1)
kmm += [D[m][ran]]
return kmm
def ProfileWithPseudocounts(Motifs):
t = len(Motifs)
k = len(Motifs[0])
profile = CountWithPseudocounts(Motifs) # output variable
for symbol in profile:
for kk in range(0,len(profile[symbol])):
profile[symbol][kk] = profile[symbol][kk]/(len(Motifs) + 4)
return profile
def CountWithPseudocounts(Motifs):
count = {}
for i in 'ACGT':
count[i] = []
for ii in range(len(Motifs[0])):
count[i].append(1)
for i in range(len(Motifs)):
for j in range(len(Motifs[0])):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def Score(Motifs):
count = 0
L = Consensus(Motifs)
for i in Motifs:
for chr1, chr2 in zip(i,L):
if chr1 != chr2:
count += 1
return count
def Consensus(Motifs):
k = len(Motifs[0])
count = Count(Motifs)
consensus = ""
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
count = {}
for i in 'ACGT':
count[i] = []
for ii in range(len(Motifs[0])):
count[i].append(0)
for i in range(len(Motifs)):
for j in range(len(Motifs[0])):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def RandomMotifs(dna,k,t):
kmm = []
sc = []
D = {}
for i in range(0,len(dna)):
km = []
for kk in range(len(dna[i])-k+1):
km += [dna[i][kk:kk+k]]
D[i] = km
for m in range(0,t):
ran = random.randint(0,len(D[0])-1)
kmm += [D[m][ran]]
return kmm
def Motifs(pf,dna):
k = len(pf['A'])
D = []
for i in range(0,len(dna)):
km = []
sc = []
for kk in range(len(dna[i])-k+1):
km += [dna[i][kk:kk+k]]
for i in km:
sc += [Pr(i,pf)]
D += [km[sc.index(max(sc))]]
return D
def Pr(Text, Profile):
p = 1
for i in range(0,len(Text)):
p *= Profile[Text[i]][i]
return p
def RandomizedMotifSearch(Dna, k, t):
M = RandomMotifs(Dna, k, t)
BestMotifs = M
while True:
Profile = ProfileWithPseudocounts(M)
M = Motifs(Profile, Dna)
if Score(M) < Score(BestMotifs):
BestMotifs = M
else:
return BestMotifs
Dna = ["GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC"]
# set t equal to the number of strings in Dna, k equal to 15, and N equal to nb of bases per string.
t = 10
k = 15
N = 100
BestMotifs = RandomizedMotifSearch(Dna, k, t)
for i in range(N):
m = RandomizedMotifSearch(Dna, k, t)
if Score(m) < Score(BestMotifs):
BestMotifs = m
print(BestMotifs)
print (Score(BestMotifs))