-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_simulated_datasets.py
313 lines (292 loc) · 10.6 KB
/
generate_simulated_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#!/usr/bin/env python
# coding: utf-8
"""
Created on Tue Dec 14 19:32:12 2021
@author: James.Pettengill
"""
from Bio import SeqIO
import pandas as pd
import subprocess
from io import BytesIO
import argparse
import sys
import os
def get_results(
in_file, primer_amplicon, id_file, number_reads, read_length, fragment_length
):
### Get reference genomes out of in_file
print("Getting reference ids and proportions")
ref_ids = {}
with open(id_file, "r") as infile:
for line in infile:
ref_ids[line.split("\t")[0]] = line.split("\t")[1].replace("\n", "")
for ref_id, proportion in ref_ids.items():
with open(ref_id + ".fasta", "w") as f:
fasta_sequences = SeqIO.parse(open(in_file), "fasta")
for seq in fasta_sequences:
if ref_id in seq.id:
SeqIO.write([seq], f, "fasta")
### Get primers in in_silico_pcr.pl format
print("Reading primer tsv file")
primer_info = pd.read_csv(primer_amplicon, header=0, sep="\t")
primer_info["name"] = primer_info["name"].str.replace("varskip-0317-1_", "varskip_")
primer_info["name"] = primer_info["name"].str.replace(
"varskip_long-", "varskip_long_"
)
primer_info["name"] = primer_info["name"].str.replace("SARS-CoV-", " SARS_CoV_")
primer_info["output_prefix"] = primer_info.name.replace(
"_LEFT.*|_RIGHT.*|-.*", "", regex=True
)
amplicon_ids = list(set(list(primer_info["output_prefix"])))
primer_info_formatted = pd.DataFrame()
for amplicon in amplicon_ids:
primer_dict = {}
tmp = primer_info[primer_info["output_prefix"] == amplicon]
primer_info_left = list(tmp[tmp["name"].str.contains("LEFT")]["seq"])
primer_info_right = list(tmp[tmp["name"].str.contains("RIGHT")]["seq"])
primer_pairs = []
for left_primer in primer_info_left:
for right_primer in primer_info_right:
primer_pairs.append([left_primer, right_primer])
primer_dict[amplicon] = primer_pairs
primer_df = pd.DataFrame.from_dict(primer_dict)
primer_df = pd.DataFrame(
primer_df[amplicon].to_list(), columns=["LEFT", "RIGHT"]
)
primer_df["Amplicon"] = amplicon
primer_info_formatted = primer_info_formatted.append(primer_df)
primer_info_formatted.to_csv(
os.path.basename(primer_amplicon).split(".")[0] + "_for_in_silico_pcr.tsv",
index=None,
header=False,
sep="\t",
)
### Run in_silico_pcr.pl
print("Running in_silico_pcr.pl")
for ref_id, proportion in ref_ids.items():
cmd = (
"in_silico_PCR.pl -p "
+ primer_amplicon.split(".")[0]
+ "_for_in_silico_pcr.tsv -s "
+ ref_id
+ ".fasta"
)
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
results = proc.stdout.read()
try:
tmp = pd.read_csv(BytesIO(results), sep="\t", header=0)
tmp.to_csv(
ref_id + "_" + os.path.basename(primer_amplicon).split(".")[0] + ".bed",
index=None,
sep="\t",
)
except pd.errors.EmptyDataError:
print(
"\n***ERROR****\nProblems reading in_silico_pcr.pl results - make sure it is installed"
)
sys.exit(1)
### Generate amplicon fasta file from bed file
bed_info = {}
for ref_id, proportion in ref_ids.items():
bed_info[ref_id] = []
with open(
ref_id + "_" + os.path.basename(primer_amplicon).split(".")[0] + ".bed", "r"
) as bed_file:
for line in bed_file:
if "AmpId" not in line and "No amplification" not in line:
name = line.split("\t")[0]
begin = line.split("\t")[2]
end = line.split("\t")[3].replace("\n", "")
bed_info[ref_id].append([name, begin, end])
### Write out a single file of amplicons adjusting for the proportion of each variant
output_prefix = "_".join(
"{}_{}".format(key, value) for key, value in ref_ids.items()
)
number_amplicons = []
with open(output_prefix + "_" + "amplicons.fasta", "w") as outfile:
for ref_id, values in bed_info.items():
fasta_sequences = SeqIO.parse(open(ref_id + ".fasta"), "fasta")
proportion = int(ref_ids[ref_id])
for seq in fasta_sequences:
if ref_id in seq.id:
for i in range(1, proportion):
for value in values:
## Get count of amplicons to scale number of reads
number_amplicons.append(value)
amplicon = seq.seq[
int(float(value[1]) - 1) : int(
(float(value[1]) + float(value[2])) - 1
)
]
outfile.write(
">"
+ ref_id
+ "_"
+ value[0]
+ "\n"
+ str(amplicon)
+ "\n"
)
number_amplicons = len(number_amplicons)
### Generate amplicon files for each genome
for ref_id, values in bed_info.items():
fasta_sequences = SeqIO.parse(open(ref_id + ".fasta"), "fasta")
for seq in fasta_sequences:
if ref_id in seq.id:
with open(
ref_id + "_" + primer_amplicon.split(".")[0] + "amplicons.fasta",
"w",
) as outfile:
for value in values:
amplicon = seq.seq[
int(float(value[1]) - 1) : int(
(float(value[1]) + float(value[2])) - 1
)
]
outfile.write(
">" + ref_id + "_" + value[0] + "\n" + str(amplicon) + "\n"
)
## Run art and adjust coverage to account for total number of reads desired
input_id = output_prefix + "_" + "amplicons.fasta"
output_id = output_prefix + "art_out"
print("Running art for ", input_id)
try:
subprocess.call(
[
"art_illumina",
"-ss",
"MSv3",
"-p",
"-na",
"-i",
input_id,
"-l",
str(read_length),
"-s",
"10",
"-m",
str(fragment_length),
"-f",
str(round(int(number_reads) / int(number_amplicons))),
"-o",
output_id,
]
)
except subprocess.CalledProcessError as e:
print(
f"\n***ERROR***\nProblems running art - make sure it is installed and the fasta input file isn't empty. Error: {e}"
)
sys.exit(1)
except Exception as e:
print(
f"\n***ERROR***\nAn unexpected error occurred: {e}"
)
sys.exit(1)
### Generate single art files per reference genome
for ref_id, proportion in ref_ids.items():
input_id = ref_id + "_" + primer_amplicon.split(".")[0] + "amplicons.fasta"
output_id = ref_id + "_" + primer_amplicon.split(".")[0] + "art_out"
subprocess.call(
[
"art_illumina",
"-ss",
"MSv3",
"-p",
"-na",
"-i",
input_id,
"-l",
str(read_length),
"-s",
"10",
"-m",
str(fragment_length),
"-f",
str(1),
"-o",
output_id,
]
)
def parse_arguments(system_args):
"""
Parse command line arguments
"""
usage = """Takes a SC2 reference fasta, tsv file with amplicon id, start position, and length (output from in_silico_pcr.pl); \
tsv file with amplicon id and number of copies used to simulate differential coverage. \
Must have in_silico_pcr.pl and art installed and in in your path.
Example usage:
generate_reads.py -f relevantGISAID.fa -i cdc_epi_isl.txt -p primer_info_vss1a.primer.tsv """
parser = argparse.ArgumentParser(
description=usage, formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument(
"-f",
"--in-file",
metavar="FILE",
dest="in_file",
default=None,
required=True,
help="Path to reference SC2 reference fastas.",
)
parser.add_argument(
"-i",
"--id-file",
metavar="FILE",
dest="id_file",
default=None,
required=True,
help="Path to new line delimited list of epi isls in SC2 reference fastas.",
)
parser.add_argument(
"-p",
"--primer-amplicon",
metavar="FILE",
dest="primer_amplicon",
default=None,
help="Path to file with of format https://raw.githubusercontent.com/primer_infoiolabs/VarSkip/main/primer_info_vss1a.primer.tsv",
)
parser.add_argument(
"-n",
"--number-reads",
metavar="INT",
dest="number_reads",
default=10000,
help="Total number of reads to generate via art. Default is 10,000",
)
parser.add_argument(
"-l",
"--read-length",
metavar="INT",
dest="read_length",
default=150,
help="Read length for art simulations.",
)
parser.add_argument(
"-r",
"--fragment-length",
metavar="INT",
dest="fragment_length",
default=400,
help="Expected fragment length.",
)
# parser.add_argument("-f", "--frequency-amplicons", metavar="FILE", dest="frequency_amplicons", default=None, help="Path to file with amplicon id and frequency")
args = parser.parse_args(system_args)
return args
def main(args):
"""
Generate results
"""
# All args
in_file = args.in_file
id_file = args.id_file
primer_amplicon = args.primer_amplicon
number_reads = args.number_reads
fragment_length = args.fragment_length
read_length = args.read_length
# frequency_amplicons = args.frequency_amplicons
get_results(
in_file, primer_amplicon, id_file, number_reads, read_length, fragment_length
)
if __name__ == "__main__":
args = parse_arguments(sys.argv[1:])
main(args)