-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenerateTIS.py
352 lines (302 loc) · 15.2 KB
/
GenerateTIS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
"""
July 12, 2020
"""
"""
This python script generates a synthetic TIS dataset for training a prediction model.
It makes use of consensus sequence, upstream start codon, downstream stop codon, splice site, and nucleotide frequency to generate the dataset.
It takes a file containing the position weight matrix (pwm).
It creates two files as outputs, each containing the positive and negative samples.
It is a more simplified, randomized version of GenerateTIS.py and also has codon frequency added as a feature.
"""
import random
import json
'''
This function forms the basic structure of the synthetic sequence into a list of nucleotides.
It takes in the start codon and a certain length where len(seq) = length*2+3.
'''
def BasicStructure(start, length):
seq = []
seq += 'u' * length #upstream sequence
seq += start #start codon
seq += 'd' * length #downstream sequence
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function takes in either four numbers (probabilities) or a list of the four numbers
and gives a certain nucleotide that could be selected based on the probability.
'''
def PWMtoBase(A,C=None,G=None,T=None):
#If only a list is given, then separate it into the corresponding nucleotide.
if C == None and type(A) is list:
C,G,T = A[1],A[2],A[3]
A = A[0]
total = A+C+G+T
prob = random.uniform(1,total)
if prob <= A:
return 'A'
elif prob > A and prob <= A+C:
return 'C'
elif prob > A+C and prob <= A+C+G:
return 'G'
else:
return 'T'
'''
This function adds the consensus sequence around the start codon for the positive sample.
It takes a certain length, a sequence, a file with consensus, and l which is the length of consensus.
'''
def ConsensusSequence(length, seq, conFile):
with open(conFile, 'r') as fp:
consensus = json.load(fp) #consensus=dictionary with positions as keys and list of probabilities as value
#keys span from negative to positive number, with ATG being 0,1,2
for i in consensus.keys():
seq[length+int(i)] = PWMtoBase(consensus[i][0],consensus[i][1],consensus[i][2],consensus[i][3])
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function adds the upstream start codon to the negative samples.
It takes in the start codon, length, sequence, and the length of consensus sequence.
'''
def UpstreamStart(start, length, seq, l):
# there can be up to two upstream start codons
n = random.randint(0, 2)
# set in_len so that consensus sequence and any codon overlaping it will be disregarded for insert site.
in_len = int((length - (l + 5)) / 3) # divide by 3 to select each codon not nucleotide
# select random codon position to insert start codon
if n == 1:
pos1 = random.randint(1, in_len) # the first codon place is not considered to avoid cutting it out later on
seq[pos1 * 3:pos1 * 3 + 3] = start
elif n == 2:
pos1 = random.randint(1, in_len - 1)
pos2 = random.randint(pos1 + 1, in_len)
seq[pos1 * 3:pos1 * 3 + 3] = start
seq[pos2 * 3:pos2 * 3 + 3] = start
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function adds a downstream or upstream stop codon for negative or positive samples.
It takes a list of stop codons, length, sequence, and length of consensus, and either pos or neg.
'''
def Stop(stop_list, length, seq, l):
stop = random.choice(stop_list) #select a random stop codon
in_len = int(length / 3)
stop_site = random.randint(in_len + int((l+5)/3), in_len*2-2) # last codon place not considered to avoid cutting it out
seq[stop_site * 3:stop_site * 3 + 3] = stop
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function adds splice site consensus sequence to upstream for negative and downstream for positive sample.
It takes in length, sequence, length of consensus, 'pos' or 'neg' to indicate the sample.
'''
def SpliceSite(length, seq, l, train, cut1, cut2):
#Donor splice site
splice = [[352,361,154,132],[621,131,87,159],[85,45,777,91]]
#add splice site downstream for positive and upstream for negative sample
if train == 'pos':
n = random.randint(length+l+3, length*2+2-cut2-len(splice))
else:
n = random.randint(cut1, length-11-len(splice))
for i,s in enumerate(splice):
seq[n+i] = PWMtoBase(s)
#print(len(seq),"".join(i for i in seq))
return seq
"""
This function adds codons to the rest of the sequence.
It takes in length of sequence, the sequence, and two files that give the codon frequency of upstream and downstream.
"""
def CodonUsage(length, seq, CodonFile):
with open(CodonFile[0],'r') as fp:
up = json.load(fp) #dictionary of upstream codon frequency
with open(CodonFile[1],'r') as fp:
down = json.load(fp) #dictionary of downstream codon frequency
dna = ['A', 'C', 'G', 'T']
up_key,up_value,down_key,down_value = [],[],[],[] #list for storing codon and its frequency for up- and downstream
for (k1,v1),(k2,v2) in zip(up.items(),down.items()):
up_key.append(k1)
up_value.append(v1)
down_key.append(k2)
down_value.append(v2)
#add codon for upstream sequence
for u in range(0, length, 3):
#only add codons in position where no nucleotide is added to
check = any(i in seq[u:u+3] for i in dna)
if check == True:
continue
codon = random.choices(up_key, up_value)[0]
seq[u:u+3] = codon
# add codon for downstream sequence
for d in range(length+3, len(seq), 3):
# only add codons in position where no nucleotide is added to
check = any(i in seq[d:d + 3] for i in dna)
if check == True:
continue
codon = random.choices(down_key, down_value)[0]
seq[d:d+3] = codon
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function is used to fill all the bases in the sequence other than those already filled.
It takes in length, sequence, and either pos or neg to indicate the sample.
'''
def NucleotideFrequency(length, seq, train):
#negative training set
if train == 'neg':
for i in range(0,len(seq)):
if seq[i] not in ['u','d']: #do not add bases in positions already filled
continue
#fill with probability for both upstream and downstream of start codon
if i in range(0, length):
seq[i] = PWMtoBase(31.4,18.4,18.5,31.7)
elif i in range(length+3, len(seq)):
seq[i] = PWMtoBase(31.3,18.1,18.5,31.4)
#print("".join(i for i in seq))
return seq
#for positive training set
for j in range(0, len(seq)):
if seq[j] not in ['u','d']: #do not add bases in positions already filled
continue
# fill with probability for both upstream and downstream of start codon
if j in range(0, length):
seq[j] = PWMtoBase(31.1,19.6,15.5,33.8)
elif j in range(length + 3, len(seq)):
seq[j] = PWMtoBase(26.1,23.0,20.4,29.7)
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function generates TIS sequence by combining the above functions.
It takes in either 'pos' or 'neg, length, file containing consensus, length of consensus sequence,
codon usage files, donor, start and stop arguemtns.
'''
def Generate(train, fL, conFile, l, CodonFile, donor, startcheck=None, stopcheck=None):
#convert final length (fL) to two lengths that are equal and are divisible by 3
fL -= 3 #remove the length for start codon
cut = 0 #value needed to cut the sequence to fit final length
#add nucleotide to make length in upstream and downstream equal
if fL%2 == 0:
length = int(fL/2)
elif fL%2 == 1:
length = (fL//2)+1
cut += 1
#add nucleotide(s) to make length multiple of 3 (codon)
if length%3 == 1:
length += 2
cut += 2*2
elif length%3 == 2:
length += 1
cut += 1*2
#cut1 is for upstream and cut2 is for downstream
cut1 = cut//2
cut2 = cut - cut1
start = ['A','T','G']
stop_list = [['T', 'A', 'A'], ['T', 'A', 'G'], ['T', 'G', 'A']]
#generate the sequence
seq = BasicStructure(start, length)
if conFile!=None and train=='pos':
seq = ConsensusSequence(length, seq, conFile) #add consensus sequence
if startcheck==True and train=='pos':
seq = UpstreamStart(start, length, seq, l) #add upstream start codon
if stopcheck==True and train=='neg':
seq = Stop(stop_list, length, seq, l) #add downstream stop
if donor==True:
seq = SpliceSite(length, seq, l, train, cut1, cut2) #add splice site
if CodonFile!=None:
seq = CodonUsage(length, seq, CodonFile) #fill with codons
seq = NucleotideFrequency(length, seq, train) #fill the rest with nucleotides
#Remove any downstream stop codon in the positive set
if train == 'pos':
for p in range(0, len(seq), 3):
if p >= length+3 and seq[p:p + 3] in stop_list:
seq = seq[:p] + [PWMtoBase(26.1, 23.0, 20.4, 29.7)] \
+ [PWMtoBase(26.1, 23.0, 20.4, 29.7)] \
+ [PWMtoBase(26.1, 23.0, 20.4, 29.7)] + seq[p+3:]
#cut the sequence to match the final length (fL)
seq = seq[cut1:len(seq)-cut2]
#print(len(seq),"".join(i for i in seq))
return seq
'''
This function writes the generated TIS dataset into a file.
It takes in the number of sequences, output files for positive and negative set, final length, length of consensus,
and six arguments for features: a file containing consensus, boolean for start and stop codon, donor splice sites,
and a list of files containing codon usage.
It outputs the two file containing synthetic datasets.
'''
def WriteTIS(rows, posFile, negFile,fL=300,l=10,conFile=None,startcheck=True,stopcheck=True,donor=True,CodonFile=None):
tis_pos = open(posFile, "w+")
tis_neg = open(negFile, "w+")
print("consensus: {}, start: {}, stop: {}, donor: {}, codon: {}".format(
isinstance(conFile,str),startcheck,stopcheck,donor,isinstance(CodonFile,list)))
if CodonFile!=None:
posCodon = CodonFile[0:2] #CodonFile for positive set
negCodon = CodonFile[2:4] #CodonFile for negative set
else:
posCodon, negCodon = None, None
#Generate TIS dataset
for i in range(0, rows):
positive = "".join(i for i in Generate('pos',fL,conFile,l,posCodon,donor,startcheck=startcheck,stopcheck=stopcheck))
negative = "".join(j for j in Generate('neg',fL,conFile,l,negCodon,donor,startcheck=startcheck,stopcheck=stopcheck))
tis_pos.write(positive+'\n')
tis_neg.write(negative+'\n')
tis_pos.close()
tis_neg.close()
print("########## {:^50s} ##########".format("Start generating synthetic datasets"))
#Dataset with all features
WriteTIS(rows=27102,posFile='arabTIS.pos',negFile='arabTIS.neg',conFile='consensus_sequence.txt')
print("########## {:^50} ##########".format("Generated TIS"))
print()
#Dataset without consensus sequence
print("########## {:^50s} ##########".format("Start generating synthetic dataset without consensus sequence"))
WriteTIS(rows=27102,posFile='arabTIScon.pos',negFile='arabTIScon.neg')
print("########## {:^50} ##########".format("Generated TIS without consensus sequence"))
print()
#Dataset without upstream start codon
print("########## {:^50s} ##########".format("Start generating synthetic dataset without upstream start codon"))
WriteTIS(rows=27102,posFile='arabTISstart.pos',negFile='arabTISstart.neg',conFile='consensus_sequence.txt',startcheck=False)
print("########## {:^50} ##########".format("Generated TIS without upstream start codon"))
print()
#Dataset wtihout stop codon
print("########## {:^50s} ##########".format("Start generating synthetic dataset without stop codon"))
WriteTIS(rows=27102,posFile='arabTISstop.pos',negFile='arabTISstop.neg',conFile='consensus_sequence.txt',stopcheck=False)
print("########## {:^50} ##########".format("Generated TIS without stop codon"))
print()
#Dataset without donor splice sites
print("########## {:^50s} ##########".format("Start generating synthetic dataset without donor splice site"))
WriteTIS(rows=27102,posFile='arabTISdonor.pos',negFile='arabTISdonor.neg',conFile='consensus_sequence.txt',donor=False)
print("########## {:^50} ##########".format("Generated TIS without donor splice site"))
print()
#Dataset with codon usage
print("########## {:^50s} ##########".format("Start generating synthetic dataset with codon usage"))
c = ['codonUp_pos.txt','codonDown_pos.txt','codonUp_neg.txt','codonDown_neg.txt']
WriteTIS(rows=27102,posFile='arabTIScodon.pos',negFile='arabTIScodon.neg',conFile='consensus_sequence.txt',CodonFile=c)
print("########## {:^50} ##########".format("Generated TIS with codon usage"))
print()
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only consensus"))
#Dataset with only consensus
WriteTIS(rows=27102,posFile='arabTIS_wcon.pos',negFile='arabTIS_wcon.neg',conFile='consensus_sequence.txt',startcheck=False,stopcheck=False,donor=False)
print("########## {:^50} ##########".format("Generated TIS with consensus"))
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only upstream start"))
#Dataset with only upstream start codon
WriteTIS(rows=27102,posFile='arabTIS_wstart.pos',negFile='arabTIS_wstart.neg',stopcheck=False,donor=False)
print("########## {:^50} ##########".format("Generated TIS with upstream start"))
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only dowstream stop"))
#Dataset with only stop codon
WriteTIS(rows=27102,posFile='arabTIS_wstop.pos',negFile='arabTIS_wstop.neg',startcheck=False,donor=False)
print("########## {:^50} ##########".format("Generated TIS with stop"))
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only donor"))
#Dataset with only donor splice site
WriteTIS(rows=27102,posFile='arabTIS_wdonor.pos',negFile='arabTIS_wdonor.neg',startcheck=False,stopcheck=False)
print("########## {:^50} ##########".format("Generated TIS with only donor"))
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only codon"))
#Dataset with only codon
c = ['codonUp_pos.txt','codonDown_pos.txt','codonUp_neg.txt','codonDown_neg.txt']
WriteTIS(rows=27102,posFile='arabTIS_wcodon.pos',negFile='arabTIS_wcodon.neg',startcheck=False,stopcheck=False,donor=False,CodonFile=c)
print("########## {:^50} ##########".format("Generated TIS with only codon"))
print()
print("########## {:^50s} ##########".format("Start generating synthetic dataset with only nucleotide"))
#Dataset with only nucleotide frequnecy
WriteTIS(rows=27102,posFile='arabTIS_wnuc.pos',negFile='arabTIS_wnuc.neg',startcheck=False,stopcheck=False,donor=False)
print("########## {:^50} ##########".format("Generated TIS with only nucleotide"))
print()