Merge pull request #2 from Sentieon/dev

Add a DNAscope short-read pipeline
Sentieon · May 30, 2024 · 94fb6b3 · 94fb6b3
2 parents 61afcff + 3fdb3ed
commit 94fb6b3
Showing 13 changed files with 1,444 additions and 69 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -5,6 +5,12 @@ on:
       - main
   pull_request:
 
+env:
+  SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }}
+  SENTIEON_AUTH_MECH: "GitHub Actions - token"
+  ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }}
+  LICENSE_MESSAGE: ${{ secrets.LICENSE_MESSAGE }}
+
 jobs:
   smoke:
     strategy:
@@ -45,23 +51,56 @@ jobs:
       - name: Install sentieon
         run: |
           curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf -
+          echo "SENTIEON_VERSION=$SENTIEON_VERSION" >> $GITHUB_ENV
+          echo "PATH=$(pwd)/sentieon-genomics-$SENTIEON_VERSION/bin:$PATH" >> $GITHUB_ENV
         env:
           SENTIEON_VERSION: ${{ matrix.sentieon-version }}
-      - name: Download model
+      - name: Download long-read model
         run: |
           curl -L "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.1.bundle" \
             > "DNAscope PacBio2.1.bundle"
-      - name: Smoke test
+      - name: Download short-read model
+        run: |
+          curl -L "https://s3.amazonaws.com/sentieon-release/other/DNAscopeIlluminaWGS2.0.bundle" \
+            > "DNAscope IlluminaWGS2.0.bundle"
+      - name: set SENTIEON_AUTH_DATA
         run: |
           SENTIEON_AUTH_DATA=$(python3 .github/scripts/license_message.py encrypt --key "$ENCRYPTION_KEY" --message "$LICENSE_MESSAGE")
-          export SENTIEON_AUTH_DATA
-          . .venv/bin/activate
-          export PATH=$(pwd)/sentieon-genomics-$SENTIEON_VERSION/bin:$PATH
+          echo "SENTIEON_AUTH_DATA=$SENTIEON_AUTH_DATA" >> $GITHUB_ENV
+      - name: Test spaces
+        run: |
           gzip -dc tests/smoke/ref.fa.bgz > "tests/smoke/r ef.fa"
-          mv tests/smoke/ref.fa.fai "tests/smoke/r ef.fa.fai"
-          mv tests/smoke/sample.cram "tests/smoke/sam ple.cram"
-          mv tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai"
-
+          cp tests/smoke/ref.fa.fai "tests/smoke/r ef.fa.fai"
+          cp tests/smoke/sample.cram "tests/smoke/sam ple.cram"
+          cp tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai"
+          sentieon bwa index "tests/smoke/r ef.fa"
+          echo -e "chr20\t0\t2500" > "tests/smoke/r ef.bed"
+      - name: Smoke test - short-read
+        run: |
+          . .venv/bin/activate
+          sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \
+            --duplicate-marking rmdup --consensus --align \
+            --input_ref "tests/smoke/r ef.fa" -i "tests/smoke/illumina.cram" \
+            -m "DNAscope IlluminaWGS2.0.bundle" "output_sr.vcf.gz"
+          if [ ! -f "output_sr.vcf.gz" -o ! -f "output_sr.g.vcf.gz" ]; then
+            exit 1
+          fi
+      - name: Smoke test - short-read fastq
+        run: |
+          . .venv/bin/activate
+          samtools fastq --reference "tests/smoke/r ef.fa" -1 sr_r1.fastq.gz \
+            -2 sr_r2.fastq.gz "tests/smoke/illumina.cram" > /dev/null
+          sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \
+          --r1-fastq sr_r1.fastq.gz --r2-fastq sr_r2.fastq.gz \
+            --readgroups "@RG\tID:HG002-1\tSM:HG002" \
+            -m "DNAscope IlluminaWGS2.0.bundle" --assay WES \
+            --bed "tests/smoke/r ef.bed" "output_sr2.vcf.gz"
+          if [ ! -f "output_sr2.vcf.gz" ]; then
+            exit 1
+          fi
+      - name: Smoke test - long-read
+        run: |
+          . .venv/bin/activate
           sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
             -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
             --repeat-model tests/smoke/sample_repeat.model -g \
@@ -73,7 +112,9 @@ jobs:
           if [ ! -f "output hifi.sv.vcf.gz" -o ! -f "output hifi.haploid.vcf.gz" ]; then
             exit 1
           fi
-
+      - name: Smoke test - gVCF
+        run: |
+          . .venv/bin/activate
           sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
             -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
             --repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz"
@@ -82,15 +123,19 @@ jobs:
           if [ ! -f "output ont.sv.vcf.gz" ]; then
             exit 1
           fi
-
+      - name: Smoke test - realignment
+        run: |
+          . .venv/bin/activate
           sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
             -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
             --repeat-model tests/smoke/sample_repeat.model --align \
             --input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz"
           if [ ! -f "output realigned.vcf.gz" -o ! -f "output realigned_mm2_sorted_0.cram" ]; then
             exit 1
           fi
-
+      - name: Smoke test - fastq
+        run: |
+          . .venv/bin/activate
           samtools fastq --reference "tests/smoke/r ef.fa" \
             "tests/smoke/sam ple.cram" | \
             gzip -c > sample.fq.gz
@@ -101,11 +146,4 @@ jobs:
           if [ ! -f "output fq.vcf.gz" -o ! -f "output fq_mm2_sorted_fq_0.cram" -o ! -f "output fq.sv.vcf.gz" ]; then
             exit 1
           fi
-
-        env:
-          SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }}
-          SENTIEON_AUTH_MECH: "GitHub Actions - token"
-          ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }}
-          LICENSE_MESSAGE: ${{ secrets.LICENSE_MESSAGE }}
-          SENTIEON_VERSION: ${{ matrix.sentieon-version }}
       
diff --git a/docs/dnascope-longread.md b/docs/dnascope-longread.md
@@ -2,85 +2,137 @@
 
 Sentieon DNAscope LongRead is a pipeline for alignment and germline variant calling (SNVs, SVs, and indels) from long-read sequence data. The DNAscope LongRead pipeline is able to take advantage of longer read lengths to perform quick and accurate variant calling using specially calibrated machine learning models.
 
-This folder provides a shell script implementing the DNAscope LongRead pipeline. The pipeline will accept as input aligned reads in BAM or CRAM format, or un-aligned data in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats.
+The pipeline will accept as input aligned reads in BAM or CRAM format, or un-aligned reads in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats.
 
 DNAscope LongRead is implemented using the Sentieon software package, which requires a valid license for use. Please contact info@sentieon.com for access to the Sentieon software and an evaluation license.
 
 ## Prerequisites
 
-In order to run the pipeline, you need to use the Sentieon software package version 202308.01 or higher and [Python] version >3.8. SNV and indel calling additionally requires [bcftools] version 1.10 or higher and [bedtools]. Alignment of read data in uBAM or uCRAM or re-alignment of read data additional requires [samtools] version 1.16 or higher. The `sentieon`, `python`, `bcftools`, `bedtools`, and `samtools` executables will be accessed through the user's `PATH` environment variable.
+- Sentieon software package version 202308.01 or higher.
+- [Python] version 3.8 or higher.
+- [bcftools] version 1.10 or higher for SNV and indel calling.
+- [bedtools] for SNV and indel calling.
+- [samtools] version 1.16 or higher for alignment of read in uBAM or uCRAM format or re-alignment of previously aligned reads.
+
+The `sentieon`, `python`, `bcftools`, `bedtools`, and `samtools` executables will be accessed through the user's `PATH` environment variable.
 
 ## Input data requirements
 
 ### The Reference genome
 
-DNAscope LongRead will call variants present in the sample relative to a high quality reference genome sequence. Besides the reference genome file, a samtools fasta index file (.fai) needs to be present. We recommend aligning to a reference genome without alternate contigs.
+DNAscope LongRead will call variants present in the sample relative to a high quality reference genome sequence. Besides the reference genome file, a samtools fasta index file (.fai) needs to be present.
 
+We recommend aligning to a reference genome without alternate contigs.
 
-## Usage
 
-A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the uBAM, uCRAM, BAM, or CRAM formats:
-```sh
-sentieon-cli dnascope-longread [-h] -r REFERENCE -i SAMPLE_INPUT \
-  -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g] \
-  --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] sample.vcf.gz
-```
+## Usage
 
-With uBAM, uCRAM, BAM, or CRAM input, the read data can be re-aligned to the reference genome using minimap2 using the `--align` argument.
+### Alignment and variant calling from FASTQ
 
-A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the FASTQ format:
+A single command is run to align and call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the FASTQ format:
 ```sh
-sentieon-cli dnascope-longread [-h] -r REFERENCE --fastq INPUT_FASTQ  \
-  --readgroups READGROUP -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] \
-  [-t NUMBER_THREADS] [-g] --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] \
+sentieon-cli dnascope-longread [-h] \
+  -r REFERENCE \
+  --fastq INPUT_FASTQ ... \
+  --readgroups READGROUP ... \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b DIPLOID_BED] \
+  [-t NUMBER_THREADS] \
+  [-g] \
+  --tech HiFi|ONT \
+  [--haploid-bed HAPLOID_BED] \
   sample.vcf.gz
 ```
 
-The Sentieon LongRead pipeline requires the following arguments:
-- `-r REFERENCE`: the location of the reference FASTA file. You should make sure that the reference is the same as the one used in the mapping stage. A reference fasta index, ".fai" file, is also required.
-- `-i SAMPLE_INPUT`: the input sample file in BAM or CRAM format. One or more files can be suppied by passing multiple files after the `-i` argument.
+With FASTQ input, the DNAscope LongRead pipeline requires the following arguments:
+- `-r REFERENCE`: the location of the reference FASTA file. A reference fasta index, ".fai" file, is also required.
 - `--fastq INPUT_FASTQ`: the input sample file in FASTQ format. One or more files can be supplied by passing multiple files after the `--fastq` argument.
-- `--readgroups READGROUP`: readgroup information for the read data. The `--readgroups` argument is required if the input data is in the fastq format. This argument expects complete readgroup strings and these strings will be passed to `minimap2` through the `-R` argument. An example argument is `--readgroups '@RG\tID:foo\tSM:bar'`.
+- `--readgroups READGROUP`: readgroup information for the read data. The `--readgroups` argument is required if the input data is in the FASTQ format. This argument expects complete readgroup strings and these strings will be passed to `minimap2` through the `-R` argument. An example argument is `--readgroups '@RG\tID:foo\tSM:bar'`.
 - `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository.
 - `--tech HiFi|ONT`: Sequencing technology used to generate the reads. Supported arguments are `ONT` or `HiFi`.
+- `sample.vcf.gz`: the location of the output VCF file for SNVs and indels. The pipeline requires the output file end with the suffix, ".vcf.gz". The file path without the suffix will be used as the basename for other output files.
 
 The Sentieon LongRead pipeline accepts the following optional arguments:
 - `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required.
 - `-b DIPLOID_BED`: interval in the reference to restrict diploid variant calling, in BED file format. Supplying this file will limit diploid variant calling to the intervals inside the BED file.
 - `--haploid-bed HAPLOID_BED`: interval in the reference to restrict haploid variant calling, in BED file format. Supplying this file will perform haploid variant calling across the intervals inside the BED file.
 - `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has.
 - `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file.
-- `--align`: re-align the input read data to the reference genome using Sentieon minimap2. This argument needs to be passed to obtain useful output from uBAM or uCRAM input.
 - `-h`: print the command-line help and exit.
 - `--dry-run`: print the pipeline commands, but do not actually execute them.
 
-The Sentieon LongRead pipeline requires the following positional arguments:
-- `sample.vcf.gz`: the location and filename of the variant calling output. The filename must end with the suffix, `.vcf.gz`. The tool will output a bgzip compressed VCF file with a corresponding index file.
+### Alignment and variant calling from uBAM, uCRAM, BAM, or CRAM
+
+A single command is run to align and call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the uBAM, uCRAM, BAM, or CRAM formats:
+```sh
+sentieon-cli dnascope-longread [-h] \
+  -r REFERENCE \
+  -i SAMPLE_INPUT ... \
+  --align \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b DIPLOID_BED] \
+  [-t NUMBER_THREADS] \
+  [-g] \
+  --tech HiFi|ONT \
+  [--haploid-bed HAPLOID_BED] \
+  [--input_ref INPUT_REF] \
+  sample.vcf.gz
+```
+
+With uBAM, uCRAM, BAM, or CRAM input, the DNAscope LongRead pipeline requires the following new arguments:
+- `-i SAMPLE_INPUT`: the input input sample file in uBAM or uCRAM format. One or more files can be supplied by passing multiple files after the `-i` argument.
+- `--align`: re-align the input read data to the reference genome using Sentieon minimap2.
+
+The DNAscope LongRead pipeline accepts the following new optional arguments:
+- `--input_ref INPUT_REF`: a reference fasta used for decoding the input file(s). Required with uCRAM or CRAM input. Can be different from the fasta used with the `-r` argument.
+
+### Variant calling from BAM or CRAM
+
+A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the BAM, or CRAM formats:
+```sh
+sentieon-cli dnascope-longread [-h] \
+  -r REFERENCE \
+  -i SAMPLE_INPUT ... \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b DIPLOID_BED] \
+  [-t NUMBER_THREADS] \
+  [-g] \
+  --tech HiFi|ONT \
+  [--haploid-bed HAPLOID_BED] \
+  sample.vcf.gz
+```
+
+Not supplying the `--align` argument will direct the pipeline to call variants directly from the input reads.
 
 ## Pipeline output
 
-The Sentieon LongRead pipeline will output a bgzip compressed file (.vcf.gz) containing variant calls in the standard VCFv4.2 format along with a tabix index file (.vcf.gz.tbi). If the `-g` option is used, the pipeline will also output a bgzip compressed file (.g.vcf.gz) containing variant calls in the gVCF format along with a tabix index file (.g.vcf.gz.tbi).
+### List of output files
 
-With the `--haploid-bed HAPLOID_BED` argument, the pipeline will create the following output files:
-- `sample.vcf.gz`: SNV and indel variant calls across the diploid regions of the genome (as defined in the `-d DIPLOID_BED` file).
-- `sample.haploid.vcf.gz`: SNV and indel variant calls across the haploid regions of the genome (as defined in the `--haploid-bed HAPLOID_BED` file).
+The following files are output when processing FASTQ data or uBAM, uCRAM, BAM, or CRAM files with the `--align` argument:
+- `sample.vcf.gz`: SNV and indel variant calls across the regions of the genome as defined in the `-b DIPLOID_BED` file.
 - `sample.sv.vcf.gz`: structural variant calls from the Sentieon LongReadSV tool.
+- `sample_mm2_sorted_fq_*.cram`: aligned and coordinate-sorted reads from the input FASTQ files.
+- `sample_mm2_sorted_*.cram`: aligned and coordinate-sorted reads from the input uBAM, uCRAM, BAM, or CRAM files.
 
-With the `--align` argument or fastq input, the software will also output a CRAM or BAM file for each input file.
+With the `--haploid-bed HAPLOID_BED` argument, the pipeline will create the following additional output files:
+- `sample.haploid.vcf.gz`: SNV and indel variant calls across the haploid regions of the genome (as defined in the `--haploid-bed HAPLOID_BED` file).
 
 ## Other considerations
 
 ### Diploid and haploid variant calling
 
 The default pipeline is recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b DIPLOID_BED` option can be used to limit diploid variant calling to diploid chromosomes and the `--haploid-bed HAPLOID_BED` argument can be used to perform haploid variant calling across haploid chromosomes. Diploid and haploid variants will be output to separate VCF files.
 
-Example diploid and haploid BED files can be found in the [/data](/data) folder in this repository.
+Diploid and haploid BED files for the human hg38 reference genome (with male samples) can be found in the [/data](/data) folder in this repository.
 
 ### Modification
 
-Scripts in this folder are made available under the [BSD 2-Clause license](/LICENSE). Modification of the shell scripts in this folder is encouraged for users who wish to retain intermediate files generated by the pipeline, add additional processing steps, or modify command line arguments.
+Scripts in this repository are made available under the [BSD 2-Clause license](/LICENSE).
 
-The Python scripts in the `sentieon_cli/scripts`` folder perform low-level manipulation of intermediate gVCF and VCF files generated by the pipeline. Due to the low-level data handling performed by these scripts, modification of these files by users is discouraged.
+The Python scripts in the `sentieon_cli/scripts` folder perform low-level manipulation of intermediate gVCF and VCF files generated by the pipeline. Due to the low-level data handling performed by these scripts, modification of these files by users is discouraged.
 
 ## References
 **[Sentieon DNAscope LongRead – A highly Accurate, Fast, and Efficient Pipeline for Germline Variant Calling from PacBio HiFi reads]** - A preprint describing the DNAscope LongRead pipeline for calling variants from PacBio HiFi data.

diff --git a/docs/dnascope.md b/docs/dnascope.md
@@ -0,0 +1,167 @@
+# DNAscope
+
+Sentieon DNAscope is a pipeline for alignment and germline variant calling (SNVs, SVs and indels) from short-read DNA sequence data. The DNAscope pipeline uses a combination of traditional statistical approaches and machine learning to achieve high variant calling accuracy.
+
+The pipeline accepts as input aligned reads in BAM or CRAM format, or un-aligned reads in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats.
+
+DNAscope is implemented using the Sentieon software package, which requires a valid license for use. Please contact info@sentieon.com for access to the Sentieon software and an evaluation license.
+
+## Prerequisites
+
+- Sentieon software package version 202308 or higher.
+- [samtools] version 1.16 or higher for alignment of read in uBAM or uCRAM format or re-alignment of previously aligned reads.
+- [MultiQC] version 1.18 or higher for metrics report generation.
+
+The `sentieon`, `samtools`, and `multiqc` executables will be accessed through the user's `PATH` environment variable.
+
+## Input data requirements
+
+### The Reference genome
+
+DNAscope will call variants present in the sample relative to a high quality reference genome sequence. Besides the reference genome file, a samtools fasta index file (.fai) needs to be present. Read alignment also requires bwa index files.
+
+We recommend aligning to a reference genome without alternate contigs. If alternate contigs are present in the genome, please also supply a ".alt" file to activate [alt-aware alignment] in bwa.
+
+## Usage
+
+### Alignment and variant calling from FASTQ
+
+A single command is run to align, preprocess, and call SNVs, indels, and structural variants from FASTQ:
+```sh
+sentieon-cli dnascope [-h] \
+  -r REFERENCE \
+  --r1-fastq R1_FASTQ ... \
+  --r2-fastq R2_FASTQ ... \
+  --readgroups READGROUPS ... \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b BED] \
+  [--interval_padding INTERVAL_PADDING] \
+  [-t NUMBER_THREADS] \
+  [--pcr-free] \
+  [-g] \
+  [--duplicate-marking MARKDUP] \
+  [--assay ASSAY] \
+  [--consensus] \
+  [--dry-run] \
+  [--bam_format] \
+  sample.vcf.gz
+```
+
+With FASTQ input, the DNAscope pipeline requires the following arguments:
+- `-r REFERENCE`: the location of the reference FASTA file. A reference fasta index, ".fai" file, and bwa index files, are also required.
+- `--r1-fastq R1_FASTQ`: the R1 input FASTQ. Can be used multiple times. `--r1-fastq` files without a corresponding `--r2-fastq` are assumed to be single-ended.
+- `--r2-fastq R2_FASTQ`: the R2 input FASTQ. Can be used multiple times.
+- `--readgroups READGROUPS`: readgroup information for each FASTQ. The pipeline will expect the same number of arguments to `--r1-fastq` and `--readgroups`. An example argument is, `--readgroups "@RG\tID:HG002-1\tSM:HG002\tLB:HG002-LB-1\tPL:ILLUMINA"`
+- `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository.
+- `sample.vcf.gz`: the location of the output VCF file for SNVs and indels. The pipeline requires the output file end with the suffix, ".vcf.gz". The file path without the suffix will be used as the basename for other output files.
+
+The DNAscope pipeline accepts the following optional arguments:
+- `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required.
+- `-b BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file.
+- `--interval_padding INTERVAL_PADDING`: adds INTERVAL_PADDING bases padding to the edges of the input intervals. The default value is 0.
+- `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has.
+- `--pcr-free`: Use variant calling settings appropriate for a PCR-free library prep.
+- `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file.
+- `--duplicate-marking MARKDUP`: setting for duplicate marking. `markdup` will mark duplicate reads. `rmdup` will remove duplicate reads. `none` will skip duplicate marking.
+- `--assay ASSAY`: assay setting for metrics collection `WGS` or `WES`.
+- `--consensus`: generate consensus reads during duplicate marking.
+- `-h`: print the command-line help and exit.
+- `--dry-run`: print the pipeline commands, but do not actually execute them.
+- `--bam_format`: use BAM format instead of CRAM for output aligned files.
+
+### Alignment and variant calling from uBAM or uCRAM
+A single command is run to align, preprocess, and call SNVs, indels, and structural variants from uBAM or uCRAM files:
+```sh
+sentieon-cli dnascope [-h] \
+  -r REFERENCE \
+  -i SAMPLE_INPUT ... \
+  --align \
+  [--input_ref INPUT_REF] \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b BED] \
+  [--interval_padding INTERVAL_PADDING] \
+  [-t NUMBER_THREADS] \
+  [--pcr-free] \
+  [-g] \
+  [--duplicate-marking MARKDUP] \
+  [--assay ASSAY] \
+  [--consensus] \
+  [--dry-run] \
+  [--bam_format] \
+  sample.vcf.gz
+```
+
+With uBAM or uCRAM input, the DNAscope pipeline requires the following new arguments:
+- `-i SAMPLE_INPUT`: the input input sample file in uBAM or uCRAM format. One or more files can be supplied by passing multiple files after the `-i` argument.
+- `--align`: directs the pipeline to align the input reads.
+
+The DNAscope pipeline accepts the following new optional arguments:
+- `--input_ref INPUT_REF`: a reference fasta used for decoding the input file(s). Required with uCRAM input. Can be different from the fasta used with the `-r` argument.
+
+### Alignment and variant calling from sorted BAM or CRAM
+A single command is run to align, preprocess, and call SNVs, indels, and structural variants from BAM or CRAM files:
+```sh
+sentieon-cli dnascope [-h] \
+  -r REFERENCE \
+  -i SAMPLE_INPUT ... \
+  --collate-align \
+  [--input_ref INPUT_REF] \
+    -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b BED] \
+  [--interval_padding INTERVAL_PADDING] \
+  [-t NUMBER_THREADS] \
+  [--pcr-free] \
+  [-g] \
+  [--duplicate-marking MARKDUP] \
+  [--assay ASSAY] \
+  [--consensus] \
+  [--dry-run] \
+  [--bam_format] \
+  sample.vcf.gz
+```
+
+With BAM or CRAM input, the DNAscope pipeline requires the following new arguments:
+- `--collate-align`: directs the pipeline to collate and then align the input reads.
+
+### Variant calling from sorted BAM or CRAM
+A single command is run to preprocess, and call SNVs, indels, and structural variants from BAM or CRAM files:
+```sh
+sentieon-cli dnascope [-h] \
+  -r REFERENCE \
+  -i SAMPLE_INPUT ... \
+  -m MODEL_BUNDLE \
+  [-d DBSNP] \
+  [-b BED] \
+  [--interval_padding INTERVAL_PADDING] \
+  [-t NUMBER_THREADS] \
+  [--pcr-free] \
+  [-g] \
+  [--duplicate-marking MARKDUP] \
+  [--assay ASSAY] \
+  [--consensus] \
+  [--dry-run] \
+  [--bam_format] \
+  sample.vcf.gz
+```
+
+Not supplying the `--align` and `--collate-align` arguments will direct the pipeline to call variants directly from the input reads.
+
+## Pipeline output
+
+### List of output files
+
+The following files are output when processing WGS FASTQ with default arguments:
+- `sample.vcf.gz`: SNV and indel variant calls across the regions of the genome as defined in the `-b BED` file.
+- `sample_deduped.cram`: aligned, coordinate-sorted and duplicate-marked read data from the input FASTQ files.
+- `sample_svs.vcf.gz`: structural variant calls from DNAscope and SVSolver.
+- `sample_metrics`: a directory containing QC metrics for the analyzed sample.
+  - `sample_metrics/coverage*`: coverage metrics for the processed sample. Only available for WGS samples. Replaced by HS metrics for WES samples.
+  - `sample_metrics/multiqc_report.html`: collected QC metrics aggregated by MultiQC.
+
+[samtools]: https://www.htslib.org/
+[MultiQC]: https://multiqc.info/
+[alt-aware alignment]: https://github.com/lh3/bwa/blob/master/README-alt.md
+[sentieon-models]: https://github.com/Sentieon/sentieon-models
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 #https://stackoverflow.com/questions/75408641/whats-difference-between-tool-poetry-and-project-in-pyproject-toml
 [tool.poetry]
 name = "sentieon_cli"
-version = "0.2.0"
+version = "0.3.0"
 description = "entry point for sentieon command-line tools"
 authors = ["Don Freed <don.freed@sentieon.com>", "Brent <bpederse@gmail.com>"]
 readme = "README.md"

diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py
@@ -1,6 +1,6 @@
 import argh
 
-from . import dnascope_longread
+from . import dnascope, dnascope_longread
 
 
 def main():
@@ -24,7 +24,12 @@ def main():
         const="DEBUG",
     )
 
-    parser.add_commands([dnascope_longread.dnascope_longread])
+    parser.add_commands(
+        [
+            dnascope.dnascope,
+            dnascope_longread.dnascope_longread,
+        ]
+    )
     parser.dispatch()
 
 

diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py
@@ -299,6 +299,103 @@ def cmd_samtools_fastq_minimap2(
     return " | ".join([shlex.join(x) for x in (cmd1, cmd2, *rg_cmds, cmd3)])
 
 
+def cmd_samtools_fastq_bwa(
+    out_aln: pathlib.Path,
+    input_aln: pathlib.Path,
+    reference: pathlib.Path,
+    model_bundle: pathlib.Path,
+    cores: int,
+    rg_header: pathlib.Path,
+    input_ref: Optional[pathlib.Path] = None,
+    collate: bool = False,
+    bwa_args: str = "-K 100000000",
+    fastq_taglist: str = "RG",
+    util_sort_args: str = "--cram_write_options version=3.0,compressor=rans",
+) -> str:
+    """Re-align an input BAM/CRAM/uBAM/uCRAM file with bwa"""
+    ref_cmd: List[str] = []
+    if input_ref:
+        ref_cmd = ["--reference", str(reference)]
+
+    if collate:
+        cmd0 = (
+            [
+                "samtools",
+                "collate",
+            ]
+            + ref_cmd
+            + [
+                "-@",
+                str(cores),
+                "-O",
+                "-u",
+                "-f",
+                str(input_aln),
+            ]
+        )
+        cmd1 = [
+            "samtools",
+            "fastq",
+            "-@",
+            str(cores),
+            "-T",
+            fastq_taglist,
+            "-",
+        ]
+    else:
+        cmd0 = []
+        cmd1 = (
+            [
+                "samtools",
+                "fastq",
+            ]
+            + ref_cmd
+            + [
+                "-@",
+                str(cores),
+                "-T",
+                fastq_taglist,
+                str(input_aln),
+            ]
+        )
+    cmd2 = (
+        [
+            "sentieon",
+            "bwa",
+            "mem",
+            "-p",
+            "-C",
+            "-H",
+            str(rg_header),
+            "-t",
+            str(cores),
+            "-x",
+            f"{model_bundle}/bwa.model",
+        ]
+        + bwa_args.split()
+        + [str(reference), "/dev/stdin"]
+    )
+    cmd3 = [
+        "sentieon",
+        "util",
+        "sort",
+        "-i",
+        "-",
+        "-t",
+        str(cores),
+        "--reference",
+        str(reference),
+        "-o",
+        str(out_aln),
+        "--sam2bam",
+    ] + util_sort_args.split()
+
+    if cmd0:
+        return " | ".join([shlex.join(x) for x in (cmd0, cmd1, cmd2, cmd3)])
+    else:
+        return " | ".join([shlex.join(x) for x in (cmd1, cmd2, cmd3)])
+
+
 def cmd_fastq_minimap2(
     out_aln: pathlib.Path,
     fastq: pathlib.Path,
@@ -344,3 +441,94 @@ def cmd_fastq_minimap2(
         "--sam2bam",
     ] + util_sort_args.split()
     return " | ".join([shlex.join(x) for x in (cmd1, cmd2, cmd3)])
+
+
+def cmd_fastq_bwa(
+    out_aln: pathlib.Path,
+    r1: pathlib.Path,
+    r2: Optional[pathlib.Path],
+    readgroup: str,
+    reference: pathlib.Path,
+    model_bundle: pathlib.Path,
+    cores: int,
+    unzip: str = "gzip",
+    bwa_args: str = "-K 100000000",
+    util_sort_args: str = "--cram_write_options version=3.0,compressor=rans",
+) -> str:
+    """Align an input fastq file with bwa"""
+    cmd1 = (
+        [
+            "sentieon",
+            "bwa",
+            "mem",
+            "-R",
+            readgroup,
+            "-t",
+            str(cores),
+            "-x",
+            f"{model_bundle}/bwa.model",
+        ]
+        + bwa_args.split()
+        + [
+            str(reference),
+        ]
+    )
+    cmd2 = [
+        str(unzip),
+        "-dc",
+        str(r1),
+    ]
+    cmd3 = []
+    if r2:
+        cmd3 = [
+            str(unzip),
+            "-dc",
+            str(r2),
+        ]
+    cmd4 = [
+        "sentieon",
+        "util",
+        "sort",
+        "-i",
+        "-",
+        "-t",
+        str(cores),
+        "--reference",
+        str(reference),
+        "-o",
+        str(out_aln),
+        "--sam2bam",
+    ] + util_sort_args.split()
+
+    cmds = [shlex.join(x) for x in (cmd1, cmd2, cmd3, cmd4)]
+    cmd_str = cmds[0] + " <(" + cmds[1] + ") "
+    if cmds[2]:
+        cmd_str += " <(" + cmds[2] + ") "
+    cmd_str += " | " + cmds[3]
+    return cmd_str
+
+
+def cmd_multiqc(
+    input_directory: pathlib.Path,
+    output_directory: pathlib.Path,
+    comment: Optional[str],
+) -> str:
+    """Run MultiQC"""
+    cmd = [
+        "multiqc",
+        "-o",
+        str(output_directory),
+    ]
+    if comment:
+        cmd.extend(
+            [
+                "-b",
+                comment,
+            ]
+        )
+    cmd.extend(
+        [
+            str(input_directory),
+        ]
+    )
+    return shlex.join(cmd)
diff --git a/sentieon_cli/dnascope.py b/sentieon_cli/dnascope.py
diff --git a/sentieon_cli/dnascope_longread.py b/sentieon_cli/dnascope_longread.py
@@ -74,7 +74,8 @@ def align_inputs(
     """
     if not skip_version_check:
         for cmd, min_version in ALN_MIN_VERSIONS.items():
-            check_version(cmd, min_version)
+            if not check_version(cmd, min_version):
+                sys.exit(2)
 
     res: List[pathlib.Path] = []
     suffix = "bam" if bam_format else "cram"
@@ -131,7 +132,8 @@ def align_fastq(
 
     if not skip_version_check:
         for cmd, min_version in FQ_MIN_VERSIONS.items():
-            check_version(cmd, min_version)
+            if not check_version(cmd, min_version):
+                sys.exit(2)
 
     unzip = "igzip"
     if not shutil.which(unzip):
@@ -198,7 +200,8 @@ def call_variants(
 
     if not skip_version_check:
         for cmd, min_version in TOOL_MIN_VERSIONS.items():
-            check_version(cmd, min_version)
+            if not check_version(cmd, min_version):
+                sys.exit(2)
 
     # First pass - diploid calling
     diploid_gvcf_fn = tmp_dir.joinpath("out_diploid.g.vcf.gz")
@@ -496,7 +499,8 @@ def call_svs(
     """
     if not skip_version_check:
         for cmd, min_version in SV_MIN_VERSIONS.items():
-            check_version(cmd, min_version)
+            if not check_version(cmd, min_version):
+                sys.exit(2)
 
     sv_vcf = pathlib.Path(str(output_vcf).replace(".vcf.gz", ".sv.vcf.gz"))
     driver = Driver(
@@ -645,7 +649,7 @@ def call_svs(
     action="store_true",
 )
 def dnascope_longread(
-    output_vcf: pathlib.Path,  # pylint: disable=W0613
+    output_vcf: pathlib.Path,
     reference: Optional[pathlib.Path] = None,
     sample_input: Optional[List[pathlib.Path]] = None,
     fastq: Optional[List[pathlib.Path]] = None,
@@ -678,6 +682,7 @@ def dnascope_longread(
     assert reference
     assert sample_input or fastq
     assert model_bundle
+    assert str(output_vcf).endswith(".vcf.gz")
 
     logger.setLevel(kwargs["loglevel"])
     logger.info("Starting sentieon-cli version: %s", __version__)

diff --git a/sentieon_cli/driver.py b/sentieon_cli/driver.py
@@ -110,11 +110,15 @@ def __init__(
         dbsnp: Optional[pathlib.Path] = None,
         emit_mode: str = "variant",
         model: Optional[pathlib.Path] = None,
+        pcr_indel_model: str = "CONSERVATIVE",
+        var_type: str = "SNP,INDEL",
     ):
         self.output = output
         self.dbsnp = dbsnp
         self.emit_mode = emit_mode
         self.model = model
+        self.pcr_indel_model = pcr_indel_model
+        self.var_type = var_type
 
 
 class DNAscopeHP(BaseAlgo):
@@ -157,6 +161,200 @@ def __init__(
         self.min_af = min_af
 
 
+class LocusCollector(BaseAlgo):
+    """algo LocusCollector"""
+
+    name = "LocusCollector"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        rna: bool = False,
+        consensus: bool = False,
+        umi_tag: Optional[str] = None,
+        umi_ecc_dist: Optional[int] = None,
+        umi_ecc_lev_dist: Optional[int] = None,
+    ):
+        self.output = output
+        self.rna = rna
+        self.consensus = consensus
+        self.umi_tag = umi_tag
+        self.umi_ecc_dist = umi_ecc_dist
+        self.umi_ecc_lev_dist = umi_ecc_lev_dist
+
+
+class Dedup(BaseAlgo):
+    """algo Dedup"""
+
+    name = "Dedup"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        score_info: pathlib.Path,
+        bam_compression: Optional[int] = None,
+        cram_write_options: Optional[str] = None,
+        metrics: Optional[pathlib.Path] = None,
+        rmdup: bool = False,
+    ):
+        self.output = output
+        self.score_info = score_info
+        self.bam_compression = bam_compression
+        self.cram_write_options = cram_write_options
+        self.metrics = metrics
+        self.rmdup = rmdup
+
+
+class GVCFtyper(BaseAlgo):
+    """algo GVCFtyper"""
+
+    name = "GVCFtyper"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        vcf: pathlib.Path,
+    ):
+        self.output = output
+        self.vcf = vcf
+
+
+class SVSolver(BaseAlgo):
+    """algo SVSolver"""
+
+    name = "SVSolver"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        vcf: pathlib.Path,
+    ):
+        self.output = output
+        self.vcf = vcf
+
+
+class InsertSizeMetricAlgo(BaseAlgo):
+    """algo InsertSizeMetricAlgo"""
+
+    name = "InsertSizeMetricAlgo"
+
+    def __init__(self, output: pathlib.Path):
+        self.output = output
+
+
+class MeanQualityByCycle(BaseAlgo):
+    """algo MeanQualityByCycle"""
+
+    name = "MeanQualityByCycle"
+
+    def __init__(self, output: pathlib.Path):
+        self.output = output
+
+
+class BaseDistributionByCycle(BaseAlgo):
+    """algo BaseDistributionByCycle"""
+
+    name = "BaseDistributionByCycle"
+
+    def __init__(self, output: pathlib.Path):
+        self.output = output
+
+
+class QualDistribution(BaseAlgo):
+    """algo QualDistribution"""
+
+    name = "QualDistribution"
+
+    def __init__(self, output: pathlib.Path):
+        self.output = output
+
+
+class GCBias(BaseAlgo):
+    """algo GCBias"""
+
+    name = "GCBias"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        summary: Optional[pathlib.Path] = None,
+    ):
+        self.output = output
+        self.summary = summary
+
+
+class AlignmentStat(BaseAlgo):
+    """algo AlignmentStat"""
+
+    name = "AlignmentStat"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        adapter_seq: str = "",
+    ):
+        self.output = output
+        self.adapter_seq = adapter_seq
+
+
+class CoverageMetrics(BaseAlgo):
+    """algo CoverageMetrics"""
+
+    name = "CoverageMetrics"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        omit_base_output: bool = True,
+    ):
+        self.output = output
+        self.omit_base_output = omit_base_output
+
+
+class HsMetricAlgo(BaseAlgo):
+    """algo HsMetricAlgo"""
+
+    name = "HsMetricAlgo"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        targets_list: pathlib.Path,
+        baits_list: pathlib.Path,
+    ):
+        self.output = output
+        self.targets_list = targets_list
+        self.baits_list = baits_list
+
+
+class SequenceArtifactMetricsAlgo(BaseAlgo):
+    """algo SequenceArtifactMetricsAlgo"""
+
+    name = "SequenceArtifactMetricsAlgo"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        dbsnp: Optional[pathlib.Path] = None,
+    ):
+        self.output = output
+        self.dbsnp = dbsnp
+
+
+class WgsMetricsAlgo(BaseAlgo):
+    """algo WgsMetricsAlgo"""
+
+    name = "WgsMetricsAlgo"
+
+    def __init__(
+        self,
+        output: pathlib.Path,
+        include_unpaired: Optional[str] = None,
+    ):
+        self.output = output
+        self.include_unpaired = include_unpaired
+
+
 class Driver:
     """Representing the Sentieon driver"""
 
@@ -165,6 +363,7 @@ def __init__(
         reference: Optional[pathlib.Path] = None,
         thread_count: Optional[int] = None,
         interval: Optional[Union[pathlib.Path, str]] = None,
+        interval_padding: int = 0,
         read_filter: Optional[str] = None,
         input: Optional[List[pathlib.Path]] = None,
         algo: Optional[List[BaseAlgo]] = None,
@@ -173,6 +372,7 @@ def __init__(
         self.input = input
         self.thread_count = thread_count
         self.interval = interval
+        self.interval_padding = interval_padding
         self.read_filter = read_filter
         self.algo = algo if algo else []
 

diff --git a/sentieon_cli/runner.py b/sentieon_cli/runner.py
@@ -11,5 +11,12 @@ def run(cmd: str):
     """Run a command."""
     logger.debug("running: %s", cmd)
     t0 = time.time()
-    sp.run(cmd, shell=True, check=True, stdout=sys.stdout, stderr=sys.stderr)
+    sp.run(
+        cmd,
+        shell=True,
+        check=True,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        executable="/bin/bash",
+    )
     logger.debug("finished in: %s seconds", f"{time.time() - t0:.1f}")
diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
@@ -8,15 +8,14 @@
 import re
 import shutil
 import subprocess as sp
-import sys
 import tempfile
 from typing import Callable, List, Optional
 
 import packaging.version
 
 from .logging import get_logger
 
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 
 logger = get_logger(__name__)
 
@@ -31,16 +30,19 @@ def tmp():
     return tmp_dir
 
 
-def check_version(cmd: str, version: Optional[packaging.version.Version]):
+def check_version(
+    cmd: str,
+    version: Optional[packaging.version.Version],
+) -> bool:
     """Check the version of an executable"""
     cmd_list: List[str] = cmd.split()
     exec_file = shutil.which(cmd_list[0])
     if not exec_file:
-        print(f"Error: no '{cmd}' found in the PATH")
-        sys.exit(2)
+        logger.error("Error: no '%s' found in the PATH", cmd)
+        return False
 
     if version is None:
-        return
+        return True
 
     cmd_list.append("--version")
     cmd_version_str = sp.check_output(cmd_list).decode("utf-8").strip()
@@ -53,12 +55,16 @@ def check_version(cmd: str, version: Optional[packaging.version.Version]):
         )
     cmd_version = packaging.version.Version(cmd_version_str)
     if cmd_version < version:
-        print(
-            f"Error: the pipeline requires {cmd} version '{version}' or later "
-            f"but {cmd} '{cmd_version}' was found in the PATH"
+        logger.error(
+            "Error: the pipeline requires %s version '%s' or later "
+            "but %s '%s' was found in the PATH",
+            cmd,
+            version,
+            cmd,
+            cmd_version,
         )
-        sys.exit(2)
-    return
+        return False
+    return True
 
 
 def path_arg(

diff --git a/tests/smoke/illumina.cram b/tests/smoke/illumina.cram
diff --git a/tests/smoke/illumina.cram.crai b/tests/smoke/illumina.cram.crai