kids-first · migbro · Oct 21, 2024 · Apr 27, 2023 · Oct 16, 2024 · Oct 17, 2024
diff --git a/docs/KFDRC_GATK_HC_MOD_PLOIDY_README.md b/docs/KFDRC_GATK_HC_MOD_PLOIDY_README.md
@@ -0,0 +1,29 @@
+# Kids First DRC GATK HaplotypeCaller Modified Ploidy BETA Workflow
+This is a research workflow for users wishing to modify the ploidy of certain
+regions of their existing GVCF calls.
+
+## Inputs
+
+- input_cram: Input CRAM file
+- input_gvcf: GVCF generated in standard workflow
+- biospecimen_name: String name of biospcimen
+- output_basename: String to use as the base for output filenames
+- reference_fasta: FASTA file that was used during alignment. Also need
+  corresponding `.fai` and `.dict` files.
+- region: Specific region to pull, in format 'chr21' or 'chr3:1-1000'
+- dbsnp_vcf: dbSNP vcf file
+- dbsnp_idx: dbSNP vcf index file
+- contamination: Precalculated contamination value. Providing the value here
+  will skip the run of VerifyBAMID and use the provided value as ground truth.
+- contamination_sites_bed: .Bed file for markers used in this
+  analysis,format(chr\tpos-1\tpos\trefAllele\taltAllele)
+- contamination_sites_mu: .mu matrix file of genotype matrix
+- contamination_sites_ud: .UD matrix file from SVD result of genotype matrix
+- re_calling_interval_list: Interval list to re-call
+- wgs_evaluation_interval_list: Target intervals to restrict GVCF metric
+  analysis (for VariantCallingMetrics)
+- sample_ploidy: If sample/interval is expected to not have ploidy=2, enter expected ploidy
+
+## Outputs
+
+- mixed_ploidy_gvcf: Updated complete GVCF in which the desired region has had its ploidy updated
diff --git a/docs/dockers_gatk_cramtogvcf.md b/docs/dockers_gatk_cramtogvcf.md
@@ -7,7 +7,7 @@ gatk_indexfeaturefile.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.1.7.0R
 picard_collectgvcfcallingmetrics.cwl|pgc-images.sbgenomics.com/d3b-bixu/picard:2.18.9R
 picard_intervallisttools.cwl|pgc-images.sbgenomics.com/d3b-bixu/picard:2.18.9R
 picard_mergevcfs_python_renamesample.cwl|pgc-images.sbgenomics.com/d3b-bixu/picard:2.18.9R
-samtools_cram_to_bam.cwl|pgc-images.sbgenomics.com/d3b-bixu/samtools:1.8-dev
+samtools_cram_to_bam.cwl|pgc-images.sbgenomics.com/d3b-bixu/samtools:1.9
 samtools_idxstats_xy_ratio.cwl|pgc-images.sbgenomics.com/d3b-bixu/samtools:1.9
 untar_indexed_reference_2.cwl|None
 verifybamid_contamination_conditional.cwl|pgc-images.sbgenomics.com/d3b-bixu/verifybamid:1.0.2
diff --git a/subworkflows/kfdrc_bam_to_gvcf.cwl b/subworkflows/kfdrc_bam_to_gvcf.cwl
@@ -12,15 +12,16 @@ inputs:
   contamination_sites_bed: File
   contamination_sites_mu: File
   contamination_sites_ud: File
-  input_bam: File
-  indexed_reference_fasta: File
+  input_bam: { type: File, secondaryFiles: ['^.bai'] }
+  indexed_reference_fasta: { type: File, secondaryFiles: ['.fai', '^.dict'] }
   output_basename: string
   wgs_calling_interval_list: File
   dbsnp_vcf: File
   dbsnp_idx: File?
   reference_dict: File
   wgs_evaluation_interval_list: File
   conditional_run: int
+  sample_ploidy: { type: 'int?', doc: "If sample/interval is expected to not have ploidy=2, enter expected ploidy" }
 
 outputs:
   verifybamid_output: {type: File, outputSource: verifybamid_checkcontam_conditional/output}
@@ -63,6 +64,7 @@ steps:
       input_bam: input_bam
       interval_list: picard_intervallisttools/output
       reference: indexed_reference_fasta
+      sample_ploidy: sample_ploidy
     scatter: [interval_list]
     out: [output]
 

diff --git a/tools/bedtools_intersect.cwl b/tools/bedtools_intersect.cwl
@@ -0,0 +1,36 @@
+cwlVersion: v1.0
+class: CommandLineTool
+id: bedtools_intersect
+doc: "Subset VCF with bedtools intersect. Can add -v flag for negative selection"
+requirements:
+  - class: ShellCommandRequirement
+  - class: InlineJavascriptRequirement
+  - class: ResourceRequirement
+    ramMin: 8000
+    coresMin: 4
+  - class: DockerRequirement
+    dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest'
+
+baseCommand: [bedtools, intersect]
+arguments:
+  - position: 2
+    shellQuote: false
+    valueFrom: >-
+      -wa -header | bgzip -c -@ 4 > $(inputs.output_basename).bed_intersect.vcf.gz
+      && tabix $(inputs.output_basename).bed_intersect.vcf.gz
+
+inputs:
+    input_vcf: { type: File, secondaryFiles: ['.tbi'], doc: "Input VCF file.",
+      inputBinding: { position: 1, prefix: "-a" } }
+    input_bed_file: { type: File, doc: "bed intervals to intersect with.",
+      inputBinding: { position: 1, prefix: "-b" } }
+    output_basename: string
+    inverse: {type: 'boolean?', doc: "Select whatever is NOT in the interval bed file",
+      inputBinding: { position: 1, prefix: "-v"} }
+
+outputs:
+  intersected_vcf:
+    type: File
+    outputBinding:
+      glob: '*.bed_intersect.vcf.gz'
+    secondaryFiles: ['.tbi']
diff --git a/tools/gatk_haplotypecaller.cwl b/tools/gatk_haplotypecaller.cwl
@@ -40,5 +40,7 @@ inputs:
   input_bam: { type: File, secondaryFiles: [^.bai], doc: "Input bam file" }
   interval_list: { type: File, doc: "File containing one or more genomic intervals over which to operate" }
   contamination: { type: float, doc: "Fraction of contamination in sequencing data (for all samples) to aggressively remove" }
+  sample_ploidy: { type: 'int?', doc: "If sample/interval is expected to not have ploidy=2, enter expected ploidy",
+    inputBinding: {position: 1, prefix: "--sample_ploidy" } }
 outputs:
   output: { type: File, outputBinding: { glob: '*.vcf.gz' }, secondaryFiles: [.tbi] }
diff --git a/tools/gatk_intervallist_to_bed.cwl b/tools/gatk_intervallist_to_bed.cwl
@@ -0,0 +1,26 @@
+cwlVersion: v1.0
+class: CommandLineTool
+id: gatk4_intervallist2bed
+requirements:
+  - class: ShellCommandRequirement
+  - class: InlineJavascriptRequirement
+  - class: DockerRequirement
+    dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/gatk:4.1.1.0'
+  - class: ResourceRequirement
+    ramMin: 2000
+
+baseCommand: [/gatk, IntervalListToBed, --java-options, -Xmx100m]
+arguments:
+  - position: 1
+    shellQuote: false
+    valueFrom: >-
+      -O $(inputs.interval_list.nameroot).bed
+
+inputs:
+  interval_list: { type: File, doc: "Interval list to convert",
+    inputBinding: { position: 0, prefix: "-I"} }
+outputs:
+  output:
+    type: File
+    outputBinding:
+      glob: '*.bed'
diff --git a/tools/samtools_cram_to_bam.cwl b/tools/samtools_cram_to_bam.cwl
@@ -8,19 +8,24 @@ requirements:
     ramMin: 16000
     coresMin: $(inputs.threads)
   - class: DockerRequirement
-    dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/samtools:1.8-dev'
-baseCommand: [samtools, view]
+    dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/samtools:1.9'
+baseCommand: [samtools, view, -b]
 arguments:
-  - position: 1
+  - position: 3
     shellQuote: false
     valueFrom: >-
-      -b -T $(inputs.reference.path) -@ $(inputs.threads) $(inputs.input_cram.path) > $(inputs.output_basename).bam
+      > $(inputs.output_basename).bam
       && samtools index -@ $(inputs.threads) $(inputs.output_basename).bam $(inputs.output_basename).bai
 inputs:
-  input_cram: File
+  input_cram: { type: File, secondaryFiles: ['.crai?'], doc: "cram file to convert",
+    inputBinding: { position: 2 } }
+  region: { type: 'string?', doc: "Specific region to pull, in format 'chr21' or 'chr3:1-1000'",
+    inputBinding: { position: 2 } }
   output_basename: string
-  threads: { type: 'int?', doc: "num threads to use", default: 8}
-  reference: {type: File, secondaryFiles: [.fai]}
+  threads: { type: 'int?', doc: "num threads to use", default: 8,
+    inputBinding: { position: 1, prefix: "-@"} }
+  reference: {type: File, secondaryFiles: [.fai], doc: "reference file use for cram",
+    inputBinding: { position: 1, prefix: "--reference" } }
 outputs:
   output:
     type: File

diff --git a/workflows/kfdrc-gatk-haplotypecaller-ploidy-mod-wf.cwl b/workflows/kfdrc-gatk-haplotypecaller-ploidy-mod-wf.cwl
@@ -0,0 +1,132 @@
+cwlVersion: v1.2
+class: Workflow
+id: kfdrc-gatk-haplotypecaller-ploidy-mod-workflow
+label: Kids First DRC GATK HaplotypeCaller Modified Ploidy BETA Workflow
+doc: |
+  # Kids First DRC GATK HaplotypeCaller Modified Ploidy BETA Workflow
+  This is a research workflow for users wishing to modify the ploidy of certain
+  regions of their existing GVCF calls.
+
+  ## Inputs
+
+  - input_cram: Input CRAM file
+  - input_gvcf: GVCF generated in standard workflow
+  - biospecimen_name: String name of biospcimen
+  - output_basename: String to use as the base for output filenames
+  - reference_fasta: FASTA file that was used during alignment. Also need
+    corresponding `.fai` and `.dict` files.
+  - region: Specific region to pull, in format 'chr21' or 'chr3:1-1000'
+  - dbsnp_vcf: dbSNP vcf file
+  - dbsnp_idx: dbSNP vcf index file
+  - contamination: Precalculated contamination value. Providing the value here
+    will skip the run of VerifyBAMID and use the provided value as ground truth.
+  - contamination_sites_bed: .Bed file for markers used in this
+    analysis,format(chr\tpos-1\tpos\trefAllele\taltAllele)
+  - contamination_sites_mu: .mu matrix file of genotype matrix
+  - contamination_sites_ud: .UD matrix file from SVD result of genotype matrix
+  - re_calling_interval_list: Interval list to re-call
+  - wgs_evaluation_interval_list: Target intervals to restrict GVCF metric
+    analysis (for VariantCallingMetrics)
+  - sample_ploidy: If sample/interval is expected to not have ploidy=2, enter expected ploidy
+
+  ## Outputs
+
+  - mixed_ploidy_gvcf: Updated complete GVCF in which the desired region has had its ploidy updated
+
+
+requirements:
+- class: ScatterFeatureRequirement
+- class: MultipleInputFeatureRequirement
+- class: SubworkflowFeatureRequirement
+- class: InlineJavascriptRequirement
+- class: StepInputExpressionRequirement
+inputs:
+  input_cram: {type: 'File', doc: "Input CRAM file"}
+  input_gvcf: {type: File, secondaryFiles: ['.tbi'], doc: "gVCF generated in standard workflow"}
+  biospecimen_name: {type: 'string', doc: "String name of biospecimen"}
+  output_basename: {type: 'string', doc: "String to use as the base for output filenames"}
+  reference_fasta: {type: 'File', "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta,
+      secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}, {class: File, path: 60639019357c3a53540ca7e7,
+          name: Homo_sapiens_assembly38.dict}]}, secondaryFiles: ['.fai', '^.dict']}
+  region: {type: 'string?', doc: "Specific region to pull, in format 'chr21' or 'chr3:1-1000'"}
+  dbsnp_vcf: {type: 'File', doc: "dbSNP vcf file", "sbg:suggestedValue": {class: File, path: 6063901f357c3a53540ca84b, name: Homo_sapiens_assembly38.dbsnp138.vcf}}
+  dbsnp_idx: {type: 'File?', doc: "dbSNP vcf index file", "sbg:suggestedValue": {class: File, path: 6063901e357c3a53540ca834, name: Homo_sapiens_assembly38.dbsnp138.vcf.idx}}
+  contamination: {type: 'float?', doc: "Precalculated contamination value. Providing the value here will skip the run of VerifyBAMID
+      and use the provided value as ground truth."}
+  contamination_sites_bed: {type: 'File?', doc: ".Bed file for markers used in this analysis,format(chr\tpos-1\tpos\trefAllele\taltAllele)",
+    "sbg:suggestedValue": {class: File, path: 6063901e357c3a53540ca833, name: Homo_sapiens_assembly38.contam.bed}}
+  contamination_sites_mu: {type: 'File?', doc: ".mu matrix file of genotype matrix", "sbg:suggestedValue": {class: File, path: 60639017357c3a53540ca7cd,
+      name: Homo_sapiens_assembly38.contam.mu}}
+  contamination_sites_ud: {type: 'File?', doc: ".UD matrix file from SVD result of genotype matrix", "sbg:suggestedValue": {class: File,
+      path: 6063901f357c3a53540ca84f, name: Homo_sapiens_assembly38.contam.UD}}
+  re_calling_interval_list: {type: 'File', doc: "Interval list to re-call"}
+  wgs_evaluation_interval_list: {type: 'File', doc: "Target intervals to restrict gvcf metric analysis (for VariantCallingMetrics)",
+    "sbg:suggestedValue": {class: File, path: 60639017357c3a53540ca7d3, name: wgs_evaluation_regions.hg38.interval_list}}
+  sample_ploidy: {type: 'int?', doc: "If sample/interval is expected to not have ploidy=2, enter expected ploidy"}
+
+outputs:
+  mixed_ploidy_gvcf: {type: File, outputSource: picard_mergevcfs_python_renamesample/output}
+
+steps:
+  gatk_intervallist_to_bed:
+    run: ../tools/gatk_intervallist_to_bed.cwl
+    in:
+      interval_list: re_calling_interval_list
+    out: [output]
+  bedtools_intersect:
+    run: ../tools/bedtools_intersect.cwl
+    in:
+      input_vcf: input_gvcf
+      input_bed_file: gatk_intervallist_to_bed/output
+      output_basename: output_basename
+      inverse:
+        valueFrom: ${ return 1==1 }
+    out: [intersected_vcf]
+  samtools_cram_to_bam:
+    run: ../tools/samtools_cram_to_bam.cwl
+    in:
+      input_cram: input_cram
+      output_basename: output_basename
+      reference: reference_fasta
+      region: region
+      threads:
+        valueFrom: ${ return 16 }
+    out: [output]
+  generate_gvcf:
+    run: ../subworkflows/kfdrc_bam_to_gvcf.cwl
+    in:
+      contamination_sites_ud: contamination_sites_ud
+      contamination_sites_mu: contamination_sites_mu
+      contamination_sites_bed: contamination_sites_bed
+      input_bam: samtools_cram_to_bam/output
+      indexed_reference_fasta: reference_fasta
+      output_basename: output_basename
+      dbsnp_vcf: dbsnp_vcf
+      dbsnp_idx: dbsnp_idx
+      reference_dict:
+        source: reference_fasta
+        valueFrom: "${self.secondaryFiles.filter(function(e) {return e.nameext == '.dict'})[0])}"
+      wgs_calling_interval_list: re_calling_interval_list
+      wgs_evaluation_interval_list: wgs_evaluation_interval_list
+      conditional_run:
+        valueFrom: $(1)
+      contamination: contamination
+      biospecimen_name: biospecimen_name
+      sample_ploidy: sample_ploidy
+    out: [verifybamid_output, gvcf, gvcf_calling_metrics]
+  picard_mergevcfs_python_renamesample:
+    run: ../tools/picard_mergevcfs_python_renamesample.cwl
+    in:
+      input_vcf:
+        source: [bedtools_intersect/intersected_vcf, generate_gvcf/gvcf]
+      output_vcf_basename: output_basename
+      biospecimen_name: biospecimen_name
+    out: [output]
+
+$namespaces:
+  sbg: https://sevenbridges.com
+hints:
+- class: 'sbg:maxNumberOfParallelInstances'
+  value: 4
+sbg:license: Apache License 2.0
+sbg:publisher: KFDRC