-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkappapcrrun.py
executable file
·254 lines (233 loc) · 8.43 KB
/
kappapcrrun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import os
import pipelineparams as params
from states import States
from run import Run
import fastq
class KappaPcrRun(Run):
def __init__( self, id, full_path_of_run_dir, state=None ):
Run.__init__(self,id,full_path_of_run_dir,state)
self.type_of_runs = "KappaPcr"
self.transition = {
# curr_state, function, success_state, fail_state
States.new: (KappaPcrRun.CheckRegisteredVerbose,
States.wait_for_data,
States.check_registered ),
States.check_registered: (KappaPcrRun.CheckRegisteredSilent,
States.wait_for_data,
States.check_registered),
States.wait_for_data: (KappaPcrRun.CheckTransferComplete,
States.check_if_miseq,
States.wait_for_data),
States.check_if_miseq: (KappaPcrRun.CheckIfMiseq,
States.check_if_kappapcr,
States.error_detected),
States.check_if_kappapcr: (KappaPcrRun.CheckIfKappaPcr,
States.kappapcr_sample_sheet,
States.error_detected),
States.kappapcr_sample_sheet: (KappaPcrRun.KappaPcrSampleSheet,
States.kappapcr_demultiplex,
States.error_detected),
States.kappapcr_demultiplex: (KappaPcrRun.KappaPcrDemultiplex,
States.kappapcr_postprocess,
States.error_detected),
States.kappapcr_postprocess: (KappaPcrRun.KappaPcrPostprocess,
States.kappapcr_distribute,
States.error_detected),
States.kappapcr_distribute: (KappaPcrRun.KappaPcrDistribute,
States.qc,
States.error_detected),
States.qc: (KappaPcrRun.Qc_18,
States.archive,
States.error_detected),
States.archive: (KappaPcrRun.Archive,
States.complete,
States.error_detected),
States.error_detected: (KappaPcrRun.NotifyError,
States.error_notified,
States.error_detected),
}
def KappaPcrDistribute( self ):
"""
Copies data from Patch PCR runs to GNomEx.
"""
self.CopyDataFiles()
return False
def KappaPcrSampleSheet( self ):
"""Creates sample sheet for multiplexed flow cell, and writes
it to the BaseCalls directory within the run directory.
Returns full path of sample sheet file name.
"""
self.Log("Generating patch PCR run sample sheet for run %s." % self.id )
# Generate sample sheet name. If it already exists just
# return its name. Doing this to make it easier to override
# the automatic sample sheet creation. Brett Milash, 10/11/2013.
self.sample_sheet = self.SampleSheetName()
if os.path.exists(self.sample_sheet):
self.Log("Sample sheet for run %s already exists." % self.id)
return True
# Select lanes from flow cell that are bar coded.
try:
query = """select sample.number,
sample.barcodesequence,
request.number
from flowcell
join flowcellchannel on flowcellchannel.idflowcell=flowcell.idflowcell
join sequencelane on sequencelane.idflowcellchannel =
flowcellchannel.idflowcellchannel
join sample on sequencelane.idsample = sample.idsample
left outer join genomebuild on sequencelane.idgenomebuildalignto =
genomebuild.idgenomebuild
join request on sequencelane.idrequest = request.idrequest
join appuser on request.idappuser = appuser.idappuser
where flowcellchannel.filename = '%s'
order by flowcellchannel.number, sample.number;""" % self.id
c = self.Query( query )
except pymssql.OperationError:
print query
raise
results = c.fetchall()
# Open file
ofs = open(self.sample_sheet,'w')
# Copy the header from the existing sample sheet.
ifs = open(os.path.join(self.dirname,'SampleSheet.csv'))
rec = ifs.readline()
while rec and not rec.startswith('Sample_ID'):
ofs.write(rec)
rec = ifs.readline()
# Write the Sample_ID header.
ofs.write(rec)
ifs.close()
# Write the results.
s = [''] * 10
for rec in results:
# Write row.
# Sample id.
s[0]=rec[0]
# Barcode.
s[5]=rec[1]
# Request number
s[8]=rec[2]
ofs.write(','.join(s)+ '\n')
# Close file.
ofs.close()
return True
def CheckIfKappaPcr( self ):
"""
Determines if a miseq run is a patch pcr run.
"""
self.Log("Checking if run %s is a kappa pcr run." % self.id)
query = """select distinct application.codeapplication, flowcellchannel.filename, application.application
from application
join seqlibprotocolapplication
on application.codeapplication = seqlibprotocolapplication.codeapplication
join seqlibprotocol on seqlibprotocol.idseqlibprotocol = seqlibprotocolapplication.idseqlibprotocol
join sample on sample.idseqlibprotocol = seqlibprotocol.idseqlibprotocol
join sequencelane on sequencelane.idsample = sample.idsample
join flowcellchannel on sequencelane.idflowcellchannel = flowcellchannel.idflowcellchannel
where application.application like '%%Kappa PCR%%' and
flowcellchannel.filename = '%s';""" % self.id
c = self.Query(query)
results = c.fetchall()
print(results)
is_patchpcr = len(results)>0
if is_patchpcr:
self.Log("Run %s is a kappa pcr run." % self.id)
else:
self.Log("Run %s is not a kappa pcr run." % self.id)
return is_patchpcr
def KappaPcrDemultiplex( self ):
"""
Calls bcl2fastq2 program to demultiplex the run.
"""
self.Log("Demultiplexing kappa PCR run %s." % self.id)
# Set minimum read length to the length of the bar code
# read carrying the random n-mer. This will always be the
# 3rd read.
read_lengths = self.ReadLengths()
min_read_length = read_lengths[2]
if self.CheckIfKappaPcr():
# Determine the use-bases-mask.
if len(read_lengths) == 4:
# This is a paired-end run with two index reads.
use_bases_mask = "Y*,Y*,I*,Y*"
else:
# This is a single-end run with two index reads.
use_bases_mask = "Y*,Y*,I*"
child_dir=self.dirname
output_dir = os.path.join( self.dirname, "Unaligned" )
child_stdout=open(os.path.join(child_dir,"bcl2fastq.out"),'w')
child_stderr=open(os.path.join(child_dir,"bcl2fastq.err"),'w')
child_args=[os.path.join(params.bcl2fastq2_dir,"bcl2fastq"),
"--runfolder-dir",self.dirname,
"--output-dir",output_dir,
"--sample-sheet",self.sample_sheet,
"--barcode-mismatches","1",
"--use-bases-mask",use_bases_mask,
"--minimum-trimmed-read-length", `min_read_length`
]
p = subprocess.Popen(args=child_args,cwd=child_dir,stdout=child_stdout,stderr=child_stderr)
retval = p.wait()
child_stdout.close()
child_stderr.close()
return True
def KappaPcrPostProcessSample(self,project,sample):
"""
Postprocesses data files for one sample in a Kappa PCR run.
Takes the sequence from the read 2 file and appends it to
the read names in the read 1 and read 3 files (if present).
"""
self.Log("Post processing sample %s." % sample)
# Create input file names for reads R1, R2, and R3.
r1_in_file = os.path.join(self.dirname,"Unaligned",project,"%s_L001_R1_001.fastq.gz"%sample)
r2_in_file = os.path.join(self.dirname,"Unaligned",project,"%s_L001_R2_001.fastq.gz"%sample)
r3_in_file = os.path.join(self.dirname,"Unaligned",project,"%s_L001_R3_001.fastq.gz"%sample)
# Create output file names for R1 and R2.
gnomex_sample=sample.split("_")[0]
r1_out_file = os.path.join(self.dirname,"Unaligned","%s_%s_1_1.txt.gz" % ( gnomex_sample, self.id))
r2_out_file = os.path.join(self.dirname,"Unaligned","%s_%s_1_2.txt.gz" % ( gnomex_sample, self.id))
# Process read 1.
r1_ifs=fastq.Reader(r1_in_file)
r2_ifs=fastq.Reader(r2_in_file)
r1_ofs=fastq.Writer(r1_out_file)
for r1_read in r1_ifs:
nmer_read = r2_ifs.next()
r1_read.name += "-" + nmer_read.sequence
r1_ofs.write(r1_read)
r1_ifs.close()
r2_ifs.close()
r1_ofs.close()
self.datafiles.append(r1_out_file)
# If read 3 exists, process it.
if os.path.exists(r3_in_file):
# Process read 1.
r3_ifs=fastq.Reader(r3_in_file)
r2_ifs=fastq.Reader(r2_in_file)
r2_ofs=fastq.Writer(r2_out_file)
for r3_read in r3_ifs:
nmer_read = r2_ifs.next()
r3_read.name += "-" + nmer_read.sequence
r2_ofs.write(r3_read)
r3_ifs.close()
r2_ifs.close()
r2_ofs.close()
self.datafiles.append(r2_out_file)
def KappaPcrPostprocess( self ):
"""
Postprocessing for kappa PCR runs. Takes the random n-mer, which
is produced by the second barcode read file, and should be in
the file with the _2 suffix, and appends the random n-mer sequence
to the read names of the data read(s).
"""
# Get projects and samples for this run from the sample sheet.
projects=self.GetSampleSheetProjectsSamples()
# For each project...
for project in projects.keys():
# For each sample...
for sample in projects[project]:
self.KappaPcrPostProcessSample(project,sample)
# Generate the MD5 checksums.
return self.GenerateChecksums()
def main():
k_run = KappaPcrRun("bar","/foo/bar")
if __name__ == "__main__":
main()