-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhackathon2nday.nf
125 lines (97 loc) · 3.6 KB
/
hackathon2nday.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* GATK4 Variant Calling Pipeline
* Usage: nextflow run gencorefacility/variant-calling-pipeline-gatk4 -with-docker gencorefacility/variant-calling-pipeline-gatk4
*
* Author: Mohammed Khalfan < [email protected] >
* NYU Center for Genetics and System Biology 2020
*/
// Setting some defaults here,
// can be overridden in config or via command line
params.reads="/home/ec2-user/environment/data/ggal/*_{1,2}.*fq"
params.ref="/home/ec2-user/environment/data/ggal/transcriptome.fa"
params.outdir="/home/ec2-user/environment/result_alignment"
params.out = "${params.outdir}/out"
params.tmpdir = "${params.outdir}/gatk_temp"
//params.snpeff_data = "${params.outdir}/snpeff_data"
// Print some stuff here
println "reads: $params.reads"
println "ref: $params.ref"
println "output: $params.out"
//println "gatk temp dir: $params.tmpdir"
//println "snpeff db: $params.snpeff_db"
//println "snpeff data: $params.snpeff_data"
// Setup the reference channel
/*
Channel
.fromPath(params.ref)
.ifEmpty { error "Cannot find any reads matching: ${params.ref}" }
.tap { ref_ch }
*/
/* Prepare the fastq read pairs for input.
* While doing this, count number of input samples
*/
num_samples = 0
Channel
.fromFilePairs( params.reads )
.ifEmpty { error "Cannot find any reads matching: ${params.reads}" }
.tap { read_pairs_ch }
.subscribe({ num_samples += 1 })
//read_pairs_ch.view()
process index {
container 'gencorefacility/variant-calling-pipeline-gatk4'
input:
path transcriptome from params.ref
output:
//path 'index' into index_ch
file("*") into index_ch
script:
"""
bwa index ${transcriptome}
"""
}
//index_ch.view()
process align {
cpus 1
publishDir "${params.out}/aligned_reads", mode:'copy' // tells the output of the chanel it is created automatically by nextflow
container 'gencorefacility/variant-calling-pipeline-gatk4'
input:
set pair_id, file(reads) from read_pairs_ch //the chanel creates the pair ids
file(index) from index_ch
path(ref) from params.ref
output:
set val(pair_id), file("${pair_id}_aligned_reads.sam") \
into aligned_reads_ch
script:
"""
bwa mem ${ref} ${reads} > ${pair_id}_aligned_reads.sam
"""
}
//aligned_reads_ch.view()
process sam_to_sorted_index_bam{
publishDir "${params.out}/sort_indexed_bam", mode:'copy'
container 'gencorefacility/variant-calling-pipeline-gatk4'
input:
set val(pair_id), file(aligned_reads) from aligned_reads_ch
output:
set val(pair_id), file("${pair_id}_aligned_reads_sorted.bam"), file("${pair_id}_aligned_reads_sorted.bam.bai") into bam_for_sorting_bam_ch
script:
"""
samtools view -S -b ${aligned_reads} > ${pair_id}_aligned_reads.bam
samtools sort ${pair_id}_aligned_reads.bam -o ${pair_id}_aligned_reads_sorted.bam
samtools index ${pair_id}_aligned_reads_sorted.bam
"""
}
//bam_for_sorting_bam_ch.view()
process markDuplicatesSpark {
publishDir "${params.out}/dedup_sorted", mode:'copy'
container 'broadinstitute/gatk:4.1.3.0'
input:
set val(pair_id), path(bamfile), path(baifile) from bam_for_sorting_bam_ch
output:
set val(pair_id), val(1), path("${pair_id}_sorted_dedup.bam") into bam_for_variant_calling_ch, sorted_dedup_ch_for_metrics, bam_for_bqsr
set val(pair_id), path("${pair_id}_dedup_metrics.txt") into dedup_qc_ch
script:
"""
mkdir tmp
gatk --java-options "-Djava.io.tmpdir=tmp" MarkDuplicatesSpark -I ${bamfile} -M ${pair_id}_dedup_metrics.txt -O ${pair_id}_sorted_dedup.bam --remove-sequencing-duplicates
"""
}