Skip to content

Commit

Permalink
Add mosdepth for alignment QC
Browse files Browse the repository at this point in the history
  • Loading branch information
DonFreed committed Jul 24, 2024
1 parent 5d82012 commit b9b2cf3
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 7 deletions.
18 changes: 11 additions & 7 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
matrix:
python-version: ["3.8", "3.12"]
poetry-version: ["1.5.1"]
sentieon-version: ["202308.01", "202308.02"]
sentieon-version: ["202308.01", "202308.03"]
os: [ubuntu-22.04] #, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
Expand Down Expand Up @@ -48,6 +48,10 @@ jobs:
run: |
sudo curl -L -o /usr/local/bin/bedtools "https://github.com/arq5x/bedtools2/releases/download/v2.31.0/bedtools.static"
sudo chmod ugo+x /usr/local/bin/bedtools
- name: Install mosdepth
run: |
sudo curl -L -o /usr/local/bin/mosdepth "https://github.com/brentp/mosdepth/releases/download/v0.3.8/mosdepth"
sudo chmod ugo+x /usr/local/bin/mosdepth
- name: Install sentieon
run: |
curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf -
Expand Down Expand Up @@ -78,7 +82,7 @@ jobs:
- name: Smoke test - short-read
run: |
. .venv/bin/activate
sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \
sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free -g \
--duplicate-marking rmdup --consensus --align \
--input_ref "tests/smoke/r ef.fa" -i "tests/smoke/illumina.cram" \
-m "DNAscope IlluminaWGS2.0.bundle" "output_sr.vcf.gz"
Expand All @@ -90,7 +94,7 @@ jobs:
. .venv/bin/activate
samtools fastq --reference "tests/smoke/r ef.fa" -1 sr_r1.fastq.gz \
-2 sr_r2.fastq.gz "tests/smoke/illumina.cram" > /dev/null
sentieon-cli dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \
sentieon-cli -v dnascope -t 1 -r "tests/smoke/r ef.fa" --pcr-free \
--r1-fastq sr_r1.fastq.gz --r2-fastq sr_r2.fastq.gz \
--readgroups "@RG\tID:HG002-1\tSM:HG002" \
-m "DNAscope IlluminaWGS2.0.bundle" --assay WES \
Expand All @@ -101,7 +105,7 @@ jobs:
- name: Smoke test - long-read
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g \
-b "tests/smoke/diploid.bed" \
Expand All @@ -115,7 +119,7 @@ jobs:
- name: Smoke test - gVCF
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz"
sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \
Expand All @@ -126,7 +130,7 @@ jobs:
- name: Smoke test - realignment
run: |
. .venv/bin/activate
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model --align \
--input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz"
Expand All @@ -139,7 +143,7 @@ jobs:
samtools fastq --reference "tests/smoke/r ef.fa" \
"tests/smoke/sam ple.cram" | \
gzip -c > sample.fq.gz
sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
sentieon-cli -v dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
--fastq sample.fq.gz --readgroups '@RG\tID:sample-1\tSM:sample' \
-m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model "output fq.vcf.gz"
Expand Down
22 changes: 22 additions & 0 deletions sentieon_cli/command_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,3 +536,25 @@ def cmd_multiqc(
]
)
return shlex.join(cmd)


def cmd_mosdepth(
sample_input: pathlib.Path,
output_directory: pathlib.Path,
fasta: Optional[pathlib.Path] = None,
threads: int = 1,
xargs: str = (
"--by 500 --no-per-base --use-median -T 1,3,5,10,15,20,30,40,50"
),
) -> str:
cmd = [
"mosdepth",
"--fasta",
str(fasta),
"--threads",
str(threads),
]
cmd.extend(shlex.split(xargs))
cmd.append(str(output_directory))
cmd.append(str(sample_input))
return shlex.join(cmd)
51 changes: 51 additions & 0 deletions sentieon_cli/dnascope_longread.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@
"sentieon driver": packaging.version.Version("202308"),
}

MOSDEPTH_MIN_VERSIONS = {
"mosdepth": packaging.version.Version("0.2.6"),
}


def align_inputs(
run: Callable[[str], None],
Expand Down Expand Up @@ -521,6 +525,45 @@ def call_svs(
return 0


def mosdepth(
run: Callable[[str], None],
output_vcf: pathlib.Path,
reference: pathlib.Path,
sample_input: List[pathlib.Path],
cores: int = mp.cpu_count(),
skip_version_check: bool = False,
**_kwargs: Any,
) -> int:
"""Run mosdepth for QC"""

if not skip_version_check:
if not all(
[
check_version(cmd, min_version)
for (cmd, min_version) in MOSDEPTH_MIN_VERSIONS.items()
]
):
logger.warning(
"Skipping mosdepth. mosdepth version %s or later not found",
MOSDEPTH_MIN_VERSIONS["mosdepth"],
)
return 1

mosdepth_dir = pathlib.Path(
str(output_vcf).replace(".vcf.gz", "_mosdepth")
)
for input_file in sample_input:
run(
cmds.cmd_mosdepth(
input_file,
mosdepth_dir,
fasta=reference,
threads=cores,
)
)
return 0


@arg(
"-r",
"--reference",
Expand Down Expand Up @@ -610,6 +653,10 @@ def call_svs(
"--skip-svs",
help="Skip SV calling",
)
@arg(
"--skip-mosdepth",
help="Skip QC with mosdepth",
)
@arg(
"--align",
help="Align the input BAM/CRAM/uBAM file to the reference genome",
Expand Down Expand Up @@ -666,6 +713,7 @@ def dnascope_longread(
dry_run: bool = False,
skip_small_variants: bool = False,
skip_svs: bool = False,
skip_mosdepth: bool = False,
align: bool = False,
input_ref: Optional[pathlib.Path] = None,
fastq_taglist: str = "*", # pylint: disable=W0613
Expand Down Expand Up @@ -708,6 +756,9 @@ def dnascope_longread(
sample_input = align_inputs(**locals())
sample_input.extend(align_fastq(**locals()))

if not skip_mosdepth:
_res = mosdepth(**locals())

if not skip_small_variants:
res = call_variants(**locals())
if res != 0:
Expand Down

0 comments on commit b9b2cf3

Please sign in to comment.