-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathkfdrc_sentieon_gvcf_wf.cwl
183 lines (175 loc) · 8.68 KB
/
kfdrc_sentieon_gvcf_wf.cwl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# CWL document header: spec version, process class, workflow identifier, and
# the human-readable label shown by workflow platforms.
cwlVersion: v1.2
class: Workflow
id: kfdrc_sentieon_gvcf_wf
label: 'Kids First DRC Sentieon gVCF Workflow'
doc: |
# Kids First Data Resource Center Sentieon gVCF Workflow
<p align="center">
<img src="./kids_first_logo.svg" alt="Kids First repository logo" width="660px" />
</p>
This workflow takes a BAM/CRAM file, runs VerifyBamID, then runs Sentieon
Haplotyper and CollectVCMetrics.
The input BAM/CRAM file can either be a BQSR-recalibrated file or a
pre-recalibration file with an accompanying recalibration table provided in the
recal_table input.
This pipeline was made possible thanks to significant software and support
contributions from Sentieon. For more information on our collaborators, check
out their website:
- Sentieon: https://www.sentieon.com/
## Relevant Software and Versions
- [Sentieon](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/): `202112.01`
### Outputs
gvcf: The germline variant calls in VCF format
gvcf_calling_metrics: Detail and summary metrics about the gVCF
verifybamid_output: If a precalculated `contamination` value is not provided by the user, the workflow will output VerifyBamID's selfSM file
### Tips for running:
1. For contamination input, either populate the `contamination` field or
provide the three contamination files: `contamination_sites_bed`,
`contamination_sites_mu`, and `contamination_sites_ud`. Failure to provide one
of these groups will result in a failed run.
1. Suggested reference inputs (available from the [Broad Resource Bundle](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/hg38/v0)):
- contamination_sites_bed: Homo_sapiens_assembly38.contam.bed
- contamination_sites_mu: Homo_sapiens_assembly38.contam.mu
- contamination_sites_ud: Homo_sapiens_assembly38.contam.UD
- dbsnp_vcf: Homo_sapiens_assembly38.dbsnp138.vcf
- reference_tar: Homo_sapiens_assembly38.tgz
- wgs_calling_interval_list: wgs_coverage_regions.hg38.interval_list
- wgs_evaluation_interval_list: wgs_evaluation_regions.hg38.interval_list
1. The input for the reference_tar must be a tar file containing the reference
fasta along with its indexes. The required indexes are
`[.64.ann,.64.amb,.64.bwt,.64.pac,.64.sa,.dict,.fai]` and are generated by bwa,
picard, and samtools. Additionally, an `.64.alt` index is recommended.
1. If you are making your own bwa indexes make sure to use the `-6` flag to
obtain the `.64` version of the indexes. Indexes that do not match this naming
schema will cause a failure in certain runner ecosystems.
1. Should you decide to create your own reference indexes and omit the ALT
index file from the reference, or if its naming structure mismatches the other
indexes, then your alignments will be equivalent to the results you would
obtain if you run BWA-MEM with the -j option.
# Optional CWL features this workflow depends on. A spec-strict runner refuses
# to execute a document that uses a feature without declaring it here.
requirements:
  - class: ScatterFeatureRequirement
  # Needed for step inputs whose `source` is a list (see `pickValue` usage).
  - class: MultipleInputFeatureRequirement
  - class: SubworkflowFeatureRequirement
  # Step inputs in this workflow use `valueFrom`; the CWL spec gates that field
  # behind StepInputExpressionRequirement.
  - class: StepInputExpressionRequirement
  # The `when`/`valueFrom` expressions use JavaScript operators (`&&`, `!=`,
  # `?:`), which are not valid plain parameter references.
  - class: InlineJavascriptRequirement
# Workflow inputs. Platform file IDs under 'sbg:suggestedValue' are quoted so
# they can never be re-typed as numbers by a YAML loader.
inputs:
  # --- Licensing -----------------------------------------------------------
  sentieon_license:
    type: 'string?'
    default: "10.5.64.221:8990"
    doc: "License server host and port"

  # --- Reads and recalibration ---------------------------------------------
  input_reads:
    type: File
    # Every index naming variant is optional: either <name>.<ext>.bai/.crai or
    # <name>.bai/.crai may accompany the reads file.
    secondaryFiles:
      - {pattern: '.bai', required: false}
      - {pattern: '^.bai', required: false}
      - {pattern: '.crai', required: false}
      - {pattern: '^.crai', required: false}
    doc: "Input BAM/CRAM file"
  recal_table:
    type: 'File?'
    doc: "Recalibration table from BQSR"
  output_basename:
    type: string
    doc: "String to use as the base for output filenames"

  # --- Reference resources -------------------------------------------------
  reference_tar:
    type: File
    doc: >-
      Tar file containing a reference fasta and, optionally, its complete set of
      associated indexes (samtools, bwa, and picard)
    'sbg:suggestedValue':
      class: File
      path: '5f4ffff4e4b0370371c05153'
      name: Homo_sapiens_assembly38.tgz
  dbsnp_vcf:
    type: File
    doc: "dbSNP vcf file"
    'sbg:suggestedValue':
      class: File
      path: '6063901f357c3a53540ca84b'
      name: Homo_sapiens_assembly38.dbsnp138.vcf
  dbsnp_idx:
    type: 'File?'
    doc: "dbSNP vcf index file"
    'sbg:suggestedValue':
      class: File
      path: '6063901e357c3a53540ca834'
      name: Homo_sapiens_assembly38.dbsnp138.vcf.idx

  # --- Contamination: give either the value or the three site files ---------
  contamination:
    type: 'float?'
    doc: >-
      Precalculated contamination value. Providing the value here will skip the
      run of VerifyBAMID and use the provided value as ground truth.
  contamination_sites_bed:
    type: 'File?'
    # Double-quoted on purpose: the \t sequences are real tab characters.
    doc: ".Bed file for markers used in this analysis,format(chr\tpos-1\tpos\trefAllele\taltAllele)"
    'sbg:suggestedValue':
      class: File
      path: '6063901e357c3a53540ca833'
      name: Homo_sapiens_assembly38.contam.bed
  contamination_sites_mu:
    type: 'File?'
    doc: ".mu matrix file of genotype matrix"
    'sbg:suggestedValue':
      class: File
      path: '60639017357c3a53540ca7cd'
      name: Homo_sapiens_assembly38.contam.mu
  contamination_sites_ud:
    type: 'File?'
    doc: ".UD matrix file from SVD result of genotype matrix"
    'sbg:suggestedValue':
      class: File
      path: '6063901f357c3a53540ca84f'
      name: Homo_sapiens_assembly38.contam.UD

  # --- Metrics ---------------------------------------------------------------
  wgs_evaluation_interval_list:
    type: File
    doc: "Target intervals to restrict gvcf metric analysis (for VariantCallingMetrics)"
    'sbg:suggestedValue':
      class: File
      path: '60639017357c3a53540ca7d3'
      name: wgs_evaluation_regions.hg38.interval_list

  # --- Switches --------------------------------------------------------------
  conditional:
    type: 'boolean?'
    doc: "Hook to enable/disable this workflow when nested in another workflow."
  run_sex_metrics:
    type: 'boolean?'
    doc: "idxstats will be collected and X/Y ratios calculated"
# Workflow outputs. The optional (`File?`) entries come from steps that may not
# produce a file on every run.
outputs:
  gvcf:
    type: File
    outputSource: sentieon_haplotyper/output
  gvcf_calling_metrics:
    type: 'File[]'
    outputSource: picard_collectgvcfcallingmetrics/output
  verifybamid_output:
    type: 'File?'
    outputSource: verifybamid_checkcontam_conditional/output
  idxstats:
    type: 'File?'
    outputSource: samtools_idxstats_xy_ratio/output
    doc: "samtools idxstats of the realigned BAM file."
  xy_ratio:
    type: 'File?'
    outputSource: samtools_idxstats_xy_ratio/ratio
    doc: "Text file containing X and Y reads statistics generated from idxstats."
# Workflow steps: prepare references, optionally collect sex metrics, check
# contamination, call variants into a gVCF, then collect calling metrics.
steps:
  # Passes the dbSNP VCF plus its optional index to the indexing tool; the
  # tool's single output is what downstream steps consume as the dbSNP input.
  index_dbsnp:
    run: ../tools/gatk_indexfeaturefile.cwl
    in:
      input_file: dbsnp_vcf
      input_index: dbsnp_idx
    out: [output]

  # Unpacks the reference tarball into an indexed FASTA and a sequence dict.
  untar_reference:
    run: ../tools/untar_indexed_reference_2.cwl
    in:
      reference_tar: reference_tar
    out: [indexed_fasta, dict]

  # Converts non-BAM reads to BAM, but only when sex metrics were requested.
  sentieon_readwriter_cram_to_bam:
    run: ../tools/sentieon_ReadWriter.cwl
    # `when` is evaluated on the step inputs AFTER valueFrom has run, so
    # input_bam is the one-element array built below, not a bare File.
    # [].concat(...)[0] unwraps the array (and also tolerates a bare File)
    # so that nameext is read from the actual reads file.
    when: "$(inputs.enable_tool && [].concat(inputs.input_bam)[0].nameext != '.bam')"
    in:
      sentieon_license: sentieon_license
      reference: untar_reference/indexed_fasta
      input_bam:
        source: input_reads
        # Wrap the single File in an array; a null value passes through as-is.
        valueFrom: '$(self ? [self] : self)'
      output_file_name:
        source: input_reads
        valueFrom: $(self.nameroot).bam
      enable_tool:
        source: run_sex_metrics
        # run_sex_metrics is optional; default to false so the `when`
        # expression always yields a boolean instead of null.
        default: false
    out: [output_reads]

  # Runs idxstats and derives the X/Y ratio when sex metrics were requested.
  samtools_idxstats_xy_ratio:
    when: $(inputs.run_idxstats)
    run: ../tools/samtools_idxstats_xy_ratio.cwl
    in:
      run_idxstats:
        source: run_sex_metrics
        # Same reason as above: an unset optional boolean must not reach
        # `when` as null.
        default: false
      input_bam:
        # Prefer the converted BAM; fall back to the original reads when the
        # conversion step was skipped and therefore produced null.
        source: [sentieon_readwriter_cram_to_bam/output_reads, input_reads]
        pickValue: first_non_null
    out: [output, ratio]

  # Contamination check; receives both the site files and any precalculated
  # value, leaving the choice between them to the wrapped tool.
  verifybamid_checkcontam_conditional:
    run: ../tools/verifybamid_contamination_conditional.cwl
    in:
      contamination_sites_bed: contamination_sites_bed
      contamination_sites_mu: contamination_sites_mu
      contamination_sites_ud: contamination_sites_ud
      precalculated_contamination: contamination
      input_bam: input_reads
      ref_fasta: untar_reference/indexed_fasta
      output_basename: output_basename
    out: [output, contamination]

  # Variant calling; writes <output_basename>.g.vcf.gz.
  sentieon_haplotyper:
    run: ../tools/sentieon_haplotyper.cwl
    in:
      sentieon_license: sentieon_license
      output_filename:
        source: output_basename
        valueFrom: $(self).g.vcf.gz
      indexed_reference_fasta: untar_reference/indexed_fasta
      input_reads:
        source: input_reads
        valueFrom: '$(self ? [self] : self)'
      qual_cal:
        source: recal_table
        # Optional table: wrapped in an array when present, null otherwise.
        valueFrom: '$(self ? [self] : self)'
      dbsnp: index_dbsnp/output
      emit_mode:
        # valueFrom is a constant, so the tool always receives "gvcf"
        # regardless of the value carried by `conditional`.
        source: conditional
        valueFrom: "gvcf"
    out: [output, recalibrated_reads]

  # Calling metrics for the gVCF, restricted to the evaluation intervals.
  picard_collectgvcfcallingmetrics:
    run: ../tools/picard_collectgvcfcallingmetrics.cwl
    in:
      dbsnp_vcf: index_dbsnp/output
      final_gvcf_base_name: output_basename
      input_vcf: sentieon_haplotyper/output
      reference_dict: untar_reference/dict
      wgs_evaluation_interval_list: wgs_evaluation_interval_list
    out: [output]
# Prefix map: binds the `sbg:` prefix used by keys and hint classes in this
# document to the Seven Bridges namespace URI.
$namespaces:
  sbg: 'https://sevenbridges.com'
# Seven Bridges-namespaced execution hint, followed by app metadata
# (license, publisher, categories, release link).
hints:
  - class: 'sbg:maxNumberOfParallelInstances'
    value: 2
'sbg:license': 'Apache License 2.0'
'sbg:publisher': 'KFDRC'
'sbg:categories': [BAM, CRAM, GVCF, SENTIEON, WGS]
'sbg:links':
  - id: 'https://github.com/kids-first/kf-alignment-workflow/releases/tag/v2.9.1'
    label: github-release