Skip to content

Commit

Permalink
feat: update multiqc to 1.22.3 (#1441)
Browse files Browse the repository at this point in the history
The new version of MultiQC supports Picard-mimicking reports from Sentieon tools: MultiQC/MultiQC#2110

This should resolve issue #1290, where an ugly workaround was implemented in the dedup rule to make MultiQC accept the dedup stats from Sentieon Dedup.

It may also allow us to move away from Picard for generating our QC reports and use the Sentieon tools instead, which should be faster and would let us remove some rules, giving a more streamlined and less messy workflow.

#### Added

- separate container for multiqc

#### Changed

- updated multiqc from 1.12 to 1.22.3

#### Removed

- sed command in the dedup rule that is no longer necessary
- deprecated and unused TNhaplotyper rule
  • Loading branch information
mathiasbio authored Jun 26, 2024
1 parent 45ff291 commit f173c27
Show file tree
Hide file tree
Showing 16 changed files with 95 additions and 109 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker_build_publish_develop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, msisensorpro, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, multiqc, msisensorpro, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
steps:
- name: Git checkout
id: git_checkout
Expand Down
31 changes: 22 additions & 9 deletions BALSAMIC/assets/scripts/collect_qc_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,41 +186,54 @@ def get_metric_condition(
return req_metrics


def get_sample_id(multiqc_key: str) -> str:
    """Return the sample ID embedded in a MultiQC data JSON key.

    Key formats found under "report_saved_raw_data":
        tumor.ACCXXXXXX
        tumor.ACCXXXXXX_FR
        ACCXXXXXX_align_sort_HMYLNDSXX_ACCXXXXXX_S165_L001

    Args:
        multiqc_key: Key naming a sample entry in the MultiQC data JSON.

    Returns:
        str: The extracted sample ID with the ACCXXXXXX format.

    Raises:
        IndexError: If the key contains neither "_align_sort_" nor a "." separator.
    """
    if "_align_sort_" in multiqc_key:
        # Alignment keys already start with the sample ID.
        sample_field = multiqc_key
    else:
        # Remaining keys carry a "<sample type>." prefix ahead of the sample ID.
        sample_field = multiqc_key.split(".")[1]

    # Anything after the first underscore (read direction, lane info, ...) is dropped.
    return sample_field.split("_")[0]


def get_multiqc_metrics(config: dict, multiqc_data: dict) -> list:
"""Extracts and returns the requested metrics from a multiqc JSON file"""

requested_metrics = get_requested_metrics(config, METRICS)

def extract(data, output_metrics, sample=None, source=None):
def extract(data, output_metrics, multiqc_key=None, source=None):
"""Recursively fetch metrics data from a nested multiqc JSON"""

if isinstance(data, dict):
for k in data:
# Ignore UMI and reverse reads metrics
if "umi" not in k:
if k in requested_metrics:
# example of possible sample-formats below from "report_saved_raw_data":
# tumor.ACCXXXXXX
# tumor.ACCXXXXXX_FR
# extracted below for id to: ACCXXXXXX
output_metrics.append(
Metric(
id=sample.split(".")[1].split("_")[0],
id=get_sample_id(multiqc_key),
input=get_multiqc_data_source(
multiqc_data, sample, source
multiqc_data, multiqc_key, source
),
name=k,
step=source,
value=data[k],
condition=get_metric_condition(
config,
requested_metrics,
sample.split(".")[1].split("_")[0],
get_sample_id(multiqc_key),
k,
),
).model_dump()
)
extract(data[k], output_metrics, k, sample)
extract(data[k], output_metrics, k, multiqc_key)

return output_metrics

Expand Down
2 changes: 1 addition & 1 deletion BALSAMIC/constants/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class PonParams:
BioinfoTools.FASTQC: DockerContainers.ALIGN_QC,
BioinfoTools.SAMTOOLS: DockerContainers.ALIGN_QC,
BioinfoTools.PICARD: DockerContainers.ALIGN_QC,
BioinfoTools.MULTIQC: DockerContainers.ALIGN_QC,
BioinfoTools.MULTIQC: DockerContainers.MULTIQC,
BioinfoTools.FASTP: DockerContainers.ALIGN_QC,
BioinfoTools.CSVKIT: DockerContainers.ALIGN_QC,
BioinfoTools.VEP: DockerContainers.ANNOTATE,
Expand Down
1 change: 1 addition & 0 deletions BALSAMIC/constants/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class DockerContainers(StrEnum):
DELLY: str = "delly"
GATK: str = "gatk"
HTSLIB: str = "htslib"
MULTIQC: str = "multiqc"
MSISENSORPRO: str = "msisensorpro"
PURECN: str = "purecn"
PYTHON_3: str = "varcall_py3"
Expand Down
1 change: 0 additions & 1 deletion BALSAMIC/containers/align_qc/align_qc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ dependencies:
- markdown=3.4.1
- markupsafe=2.1.1
- matplotlib-base=3.6.2
- multiqc=1.12
- munkres=1.1.4
- ncurses=6.4
- networkx=2.8.4
Expand Down
33 changes: 33 additions & 0 deletions BALSAMIC/containers/multiqc/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Container image providing MultiQC 1.22.3 on top of the slim Python 3.11.3 base image.
# MultiQC is installed with pip and the container runs as a non-root user.
FROM python:3.11.3-slim

LABEL base.image="python:3.11.3-slim" \
maintainer="Clinical Genomics" \
about.contact="[email protected]" \
about.home="https://github.com/MultiQC/MultiQC" \
software.version="1.22.3" \
about.documentation="https://multiqc.info/" \
about.license="MIT License (MIT)" \
about.description="Aggregate bioinformatics results across many samples into a single report"

# Set environment variables:
#   PYTHONUNBUFFERED=1 - Python writes stdout/stderr without buffering
#   PIP_NO_CACHE_DIR=1 - pip does not keep a download/build cache in the image
#   USER / HOME        - name and home directory of the non-root user created below
ENV PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
USER=ubuntu \
HOME=/home/ubuntu

# WORKDIR creates the directory when it does not exist yet, so $HOME is present
# (owned by root) before the user is added; ownership is handed over by the chown below.
WORKDIR "$HOME"

# Upgrade pip, install the pinned multiqc release, create the non-root user
# and give it ownership of its home directory
RUN python -m pip install --upgrade pip && \
pip install multiqc==1.22.3 && \
adduser --disabled-password --gecos '' "$USER" && \
chown -R "$USER:$USER" "$HOME"

# Switch to the non-root user
USER "$USER"

# Set the default shell for shell-form instructions from this point on
# (the RUN instruction above was executed with the base image's default shell)
SHELL ["/bin/bash", "-c"]

# Define the default command: start bash when no other command is supplied
CMD ["/bin/bash"]
Empty file.
1 change: 1 addition & 0 deletions BALSAMIC/containers/multiqc/multiqc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- multiqc=1.22.3
3 changes: 0 additions & 3 deletions BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,6 @@ shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ;
--score_info {output.score} \
--metrics {output.metrics} \
{output.bam};


sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics}
"""

rule sentieon_realign:
Expand Down
48 changes: 0 additions & 48 deletions BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule
Original file line number Diff line number Diff line change
Expand Up @@ -73,53 +73,6 @@ rm -rf {params.tmpdir};
"""


# Tumor-only SNV calling with the Sentieon TNhaplotyper algorithm.
# Runs the Sentieon driver on the tumor BAM together with its recalibration
# table, passing the dbSNP and COSMIC references, and writes a .vcf.gz file plus
# a sample name map containing one "<tumor>\tTUMOR" line.
rule sentieon_TNhaplotyper_tumor_only:
input:
bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
recal_data_table = expand(bam_dir + "tumor.merged.recal_data.table"),
ref = config["reference"]["reference_genome"],
dbsnp = config["reference"]["dbsnp"],
cosmic = config["reference"]["cosmic"],
output:
vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnhaplotyper.vcf.gz",
namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnhaplotyper.sample_name_map",
benchmark:
Path(benchmark_dir, "sentieon_TNhaplotyper_tumor_only_" + config["analysis"]["case_id"] + ".tsv").as_posix()
params:
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
tumor = "TUMOR",
# Becomes "--pon <value>" when get_pon(config) is not None, otherwise a single space
pon = " " if get_pon(config) is None else " ".join(["--pon", get_pon(config)]),
pcr_model = params.common.pcr_model,
sentieon_exec = config["SENTIEON_EXEC"],
sentieon_lic = config["SENTIEON_LICENSE"],
case_name = config["analysis"]["case_id"]
threads:
get_threads(cluster_config, 'sentieon_TNhaplotyper_tumor_only')
message:
"Calling SNVs using sentieon TNhaplotyper for {params.case_name}"
# The shell block points TMPDIR and SENTIEON_TMPDIR at the rule's temporary
# directory, exports the license, runs the driver and removes the directory afterwards.
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};
{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
-q {input.recal_data_table} \
--algo TNhaplotyper \
--tumor_sample {params.tumor} {params.pon} \
--pcr_indel_mode {params.pcr_model} \
--cosmic {input.cosmic} \
--dbsnp {input.dbsnp} {output.vcf};
echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap};
rm -rf {params.tmpdir};
"""


rule sentieon_TNscope_tumor_only:
input:
ref = config["reference"]["reference_genome"],
Expand All @@ -139,7 +92,6 @@ rule sentieon_TNscope_tumor_only:
tumor_options = VARCALL_PARAMS["tnscope"]["tumor"],
pon = " " if get_pon(config) is None else " ".join(["--pon", get_pon(config)]),
pcr_model = params.common.pcr_model,
sentieon_ml_tnscope = config["SENTIEON_TNSCOPE"],
sentieon_exec = config["SENTIEON_EXEC"],
sentieon_lic = config["SENTIEON_LICENSE"],
case_name = config["analysis"]["case_id"]
Expand Down
45 changes: 0 additions & 45 deletions BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule
Original file line number Diff line number Diff line change
Expand Up @@ -80,48 +80,3 @@ echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf_vardict} }} }} }}'
rm -rf {params.tmpdir};
"""


# Tumor-normal SNV calling with the Sentieon TNhaplotyper algorithm.
# Runs the Sentieon driver on the tumor and normal BAMs, restricted to the
# capture kit interval file, and writes a .vcf.gz file plus a sample name map
# with one "<tumor>\tTUMOR" line and one "<normal>\tNORMAL" line.
rule sentieon_TNhaplotyper:
input:
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
interval = config["panel"]["capture_kit"],
ref = config["reference"]["reference_genome"],
dbsnp = config["reference"]["dbsnp"],
output:
vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnhaplotyper.research.vcf.gz",
namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnhaplotyper.sample_name_map",
benchmark:
Path(benchmark_dir + 'sentieon_TNhaplotyper_' + config["analysis"]["case_id"] + ".tsv").as_posix()
params:
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
# Sample names handed to --tumor_sample / --normal_sample and written to the name map
tumor = "TUMOR",
normal = "NORMAL",
sentieon_exec = config["SENTIEON_EXEC"],
sentieon_lic = config["SENTIEON_LICENSE"],
case_name = config["analysis"]["case_id"]
threads:
get_threads(cluster_config, 'sentieon_TNhaplotyper')
message:
"Calling single nucleotide variants using TNhaplotyper for {params.case_name}"
# The shell block points TMPDIR and SENTIEON_TMPDIR at the rule's temporary
# directory, exports the license, runs the driver and removes the directory afterwards.
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};
{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bamT} \
-i {input.bamN} \
--interval {input.interval} \
--algo TNhaplotyper \
--tumor_sample {params.tumor} \
--normal_sample {params.normal} \
--dbsnp {input.dbsnp} {output.vcf};
echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap};
rm -rf {params.tmpdir};
"""
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Changed:
^^^^^^^^
* Cluster scheduler script for immediate submit https://github.com/Clinical-Genomics/BALSAMIC/pull/1372
* `SLEEP_BEFORE_START` to 600s https://github.com/Clinical-Genomics/BALSAMIC/pull/1372
* Updated MultiQC to version 1.22.3 https://github.com/Clinical-Genomics/BALSAMIC/pull/1441
* Upgrade `vcf2cytosure` version to 0.9.1 and remove hardcoded versions https://github.com/Clinical-Genomics/BALSAMIC/pull/1456

Removed:
Expand All @@ -18,6 +19,10 @@ Removed:
* `gatk_contest` rule https://github.com/Clinical-Genomics/BALSAMIC/pull/1432
* SGE (qsub) support https://github.com/Clinical-Genomics/BALSAMIC/pull/1372

Fixed:
^^^^^^
* Corrected tool name in deduplication metrics https://github.com/Clinical-Genomics/BALSAMIC/pull/1441

[15.0.0]
--------

Expand All @@ -27,12 +32,14 @@ Added:
* New option for exome samples `--exome` with modified bcftools filters compared to standard targeted workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1414
* Custom samtools script for the detection of IGH::DUX4 rearrangements https://github.com/Clinical-Genomics/BALSAMIC/pull/1397


Changed:
^^^^^^^^
* Reduced stringency of minimum MQ for all TGA to 30 from 40 https://github.com/Clinical-Genomics/BALSAMIC/pull/1414
* Removed -u flag from VarDict T+N and T only rules to remove calling only in reverse reads of overlapping mates https://github.com/Clinical-Genomics/BALSAMIC/pull/1414
* Removed -U flag to VarDict T+N rule to start calling SVs https://github.com/Clinical-Genomics/BALSAMIC/pull/1414


Removed:
^^^^^^^^
* alt_allele_in_normal filter from TNscope T+N workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1289
Expand Down
15 changes: 15 additions & 0 deletions container_tests/multiqc/multiqc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Container smoke test: every required command must be resolvable by the shell.
# Exits with status 1 at the first command that cannot be found.

required_tools=( "multiqc" )

for tool in "${required_tools[@]}"; do
  if command -v "${tool}" > /dev/null 2>&1; then
    echo "${tool} command is found and valid"
  else
    echo "${tool} could not be found"
    exit 1
  fi
done
2 changes: 1 addition & 1 deletion docs/bioinfo_softwares.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ multiqc
~~~~~~~
:Source code: `GitHub` `<https://github.com/ewels/MultiQC>`_
:Article: `Bioinformatics` `<https://doi.org/10.1093/bioinformatics/btw354>`_
:Version: `1.12`
:Version: `1.22.3`

picard
~~~~~~
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,6 +2031,7 @@ def fixture_develop_containers() -> Dict[str, str]:
"""Return a dictionary of docker hub containers for develop branch."""
return {
DockerContainers.ASCAT: "docker://clinicalgenomics/balsamic:develop-ascatNgs",
DockerContainers.MULTIQC: "docker://clinicalgenomics/balsamic:develop-multiqc",
DockerContainers.VCF2CYTOSURE: "docker://clinicalgenomics/balsamic:develop-vcf2cytosure",
DockerContainers.PYTHON_3: "docker://clinicalgenomics/balsamic:develop-varcall_py3",
DockerContainers.SOMALIER: "docker://clinicalgenomics/balsamic:develop-somalier",
Expand Down
12 changes: 12 additions & 0 deletions tests/scripts/test_collect_qc_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
get_variant_metrics,
get_metric_condition,
get_relatedness_metrics,
get_sample_id,
)


Expand Down Expand Up @@ -284,6 +285,17 @@ def test_collect_qc_metrics_counts(
assert Path(output_path).exists()


def test_get_sample_id(tumor_sample_name):
    """Tests that each supported MultiQC key format resolves to the sample ID."""

    # GIVEN one MultiQC key per supported naming scheme
    supported_keys = (
        f"tumor.{tumor_sample_name}",
        f"tumor.{tumor_sample_name}_R1",
        f"{tumor_sample_name}_align_sort_HMYLNDSXX_{tumor_sample_name}_S165_L001",
    )

    # WHEN extracting the sample ID from each key
    # THEN every key should yield the plain sample name
    assert all(get_sample_id(key) == tumor_sample_name for key in supported_keys)


def test_get_relatedness_metrics(multiqc_data_dict):
"""Tests relatedness metrics retrieval."""

Expand Down

0 comments on commit f173c27

Please sign in to comment.