From b9b2cf33f37e0dd9f6545a27c42c61feed453710 Mon Sep 17 00:00:00 2001 From: DonFreed Date: Wed, 24 Jul 2024 14:15:05 -0400 Subject: [PATCH] Add mosdepth for alignment QC --- .github/workflows/main.yml | 18 ++++++----- sentieon_cli/command_strings.py | 22 +++++++++++++ sentieon_cli/dnascope_longread.py | 51 +++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index daa59c0..aabaffe 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ jobs: matrix: python-version: ["3.8", "3.12"] poetry-version: ["1.5.1"] - sentieon-version: ["202308.01", "202308.02"] + sentieon-version: ["202308.01", "202308.03"] os: [ubuntu-22.04] #, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: @@ -48,6 +48,10 @@ jobs: run: | sudo curl -L -o /usr/local/bin/bedtools "https://github.com/arq5x/bedtools2/releases/download/v2.31.0/bedtools.static" sudo chmod ugo+x /usr/local/bin/bedtools + - name: Install mosdepth + run: | + sudo curl -L -o /usr/local/bin/mosdepth "https://github.com/brentp/mosdepth/releases/download/v0.3.8/mosdepth" + sudo chmod ugo+x /usr/local/bin/mosdepth - name: Install sentieon run: | curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf - @@ -78,7 +82,7 @@ jobs: - name: Smoke test - short-read run: | . .venv/bin/activate - sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \ + sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \ --duplicate-marking rmdup --consensus --align \ --input_ref "tests/smoke/r ef.fa" -i "tests/smoke/illumina.cram" \ -m "DNAscope IlluminaWGS2.0.bundle" "output_sr.vcf.gz" @@ -90,7 +94,7 @@ jobs: . .venv/bin/activate samtools fastq --reference "tests/smoke/r ef.fa" -1 sr_r1.fastq.gz \ -2 sr_r2.fastq.gz "tests/smoke/illumina.cram" > /dev/null - sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \ + sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \ --r1-fastq sr_r1.fastq.gz --r2-fastq sr_r2.fastq.gz \ --readgroups "@RG\tID:HG002-1\tSM:HG002" \ -m "DNAscope IlluminaWGS2.0.bundle" --assay WES \ @@ -101,7 +105,7 @@ jobs: - name: Smoke test - long-read run: | . .venv/bin/activate - sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ + sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ --repeat-model tests/smoke/sample_repeat.model -g \ -b "tests/smoke/diploid.bed" \ @@ -115,7 +119,7 @@ jobs: - name: Smoke test - gVCF run: | . .venv/bin/activate - sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \ + sentieon-cli -v dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \ -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ --repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz" sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \ @@ -126,7 +130,7 @@ jobs: - name: Smoke test - realignment run: | . .venv/bin/activate - sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ + sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ --repeat-model tests/smoke/sample_repeat.model --align \ --input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz" @@ -139,7 +143,7 @@ jobs: samtools fastq --reference "tests/smoke/r ef.fa" \ "tests/smoke/sam ple.cram" | \ gzip -c > sample.fq.gz - sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ + sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ --fastq sample.fq.gz --readgroups '@RG\tID:sample-1\tSM:sample' \ -m "DNAscope PacBio2.1.bundle" \ --repeat-model tests/smoke/sample_repeat.model "output fq.vcf.gz" diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py index 3893d5a..2d0e154 100644 --- a/sentieon_cli/command_strings.py +++ b/sentieon_cli/command_strings.py @@ -536,3 +536,25 @@ def cmd_multiqc( ] ) return shlex.join(cmd) + + +def cmd_mosdepth( + sample_input: pathlib.Path, + output_directory: pathlib.Path, + fasta: Optional[pathlib.Path] = None, + threads: int = 1, + xargs: str = ( + "--by 500 --no-per-base --use-median -T 1,3,5,10,15,20,30,40,50" + ), +) -> str: + cmd = [ + "mosdepth", + "--fasta", + str(fasta), + "--threads", + str(threads), + ] + cmd.extend(shlex.split(xargs)) + cmd.append(str(output_directory)) + cmd.append(str(sample_input)) + return shlex.join(cmd) diff --git a/sentieon_cli/dnascope_longread.py b/sentieon_cli/dnascope_longread.py index 74e7b4a..2474364 100644 --- a/sentieon_cli/dnascope_longread.py +++ b/sentieon_cli/dnascope_longread.py @@ -53,6 +53,10 @@ "sentieon driver": packaging.version.Version("202308"), } +MOSDEPTH_MIN_VERSIONS = { + "mosdepth": packaging.version.Version("0.2.6"), +} + def align_inputs( run: Callable[[str], None], @@ -521,6 +525,45 @@ def call_svs( return 0 +def mosdepth( + run: Callable[[str], None], + output_vcf: pathlib.Path, + reference: pathlib.Path, + sample_input: List[pathlib.Path], + cores: int = mp.cpu_count(), + skip_version_check: bool = False, + **_kwargs: Any, +) -> int: + """Run mosdepth for QC""" + + if not skip_version_check: + if not all( + [ + check_version(cmd, min_version) + for (cmd, min_version) in MOSDEPTH_MIN_VERSIONS.items() + ] + ): + logger.warning( + "Skipping mosdepth. mosdepth version %s or later not found", + MOSDEPTH_MIN_VERSIONS["mosdepth"], + ) + return 1 + + mosdepth_dir = pathlib.Path( + str(output_vcf).replace(".vcf.gz", "_mosdepth") + ) + for input_file in sample_input: + run( + cmds.cmd_mosdepth( + input_file, + mosdepth_dir, + fasta=reference, + threads=cores, + ) + ) + return 0 + + @arg( "-r", "--reference", @@ -610,6 +653,10 @@ def call_svs( "--skip-svs", help="Skip SV calling", ) +@arg( + "--skip-mosdepth", + help="Skip QC with mosdepth", +) @arg( "--align", help="Align the input BAM/CRAM/uBAM file to the reference genome", @@ -666,6 +713,7 @@ def dnascope_longread( dry_run: bool = False, skip_small_variants: bool = False, skip_svs: bool = False, + skip_mosdepth: bool = False, align: bool = False, input_ref: Optional[pathlib.Path] = None, fastq_taglist: str = "*", # pylint: disable=W0613 @@ -708,6 +756,9 @@ def dnascope_longread( sample_input = align_inputs(**locals()) sample_input.extend(align_fastq(**locals())) + if not skip_mosdepth: + _res = mosdepth(**locals()) + if not skip_small_variants: res = call_variants(**locals()) if res != 0: