From dc4bef2c61026bab6472478ccc94c3651a3c27aa Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Mon, 21 Aug 2023 10:38:33 -0500 Subject: [PATCH 01/51] modifies gzip decompressing cmd - https://github.com/openjournals/joss-reviews/issues/5313#issuecomment-1686290957 --- src/setup_dependency_datasets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setup_dependency_datasets.sh b/src/setup_dependency_datasets.sh index 0208e62..a091432 100755 --- a/src/setup_dependency_datasets.sh +++ b/src/setup_dependency_datasets.sh @@ -16,7 +16,7 @@ mkdir -p $REF_GENOME_DIR && cd $REF_GENOME_DIR REF_GENOME_FNAME="GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz" REF_GENOME_URL=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/${REF_GENOME_FNAME} curl -L $REF_GENOME_URL -o $REF_GENOME_FNAME -gzip -d $REF_GENOME_FNAME +gzip -c $REF_GENOME_FNAME >$(basename $REF_GENOME_FNAME .gz) ##### retrieve somalier tool dependencies ##### echo "setting up somalier dependency datasets..." 
From 3d8ef7aa93ce6e1e10cef9b4c1b3665632088884 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:16:46 -0500 Subject: [PATCH 02/51] adds pedigree dir --- .test/configs/include_priorQC/{ => pedigree}/project_1sample.ped | 0 .test/configs/include_priorQC/{ => pedigree}/project_2samples.ped | 0 .test/configs/no_priorQC/{ => pedigree}/project_1sample.ped | 0 .test/configs/no_priorQC/{ => pedigree}/project_2samples.ped | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .test/configs/include_priorQC/{ => pedigree}/project_1sample.ped (100%) rename .test/configs/include_priorQC/{ => pedigree}/project_2samples.ped (100%) rename .test/configs/no_priorQC/{ => pedigree}/project_1sample.ped (100%) rename .test/configs/no_priorQC/{ => pedigree}/project_2samples.ped (100%) diff --git a/.test/configs/include_priorQC/project_1sample.ped b/.test/configs/include_priorQC/pedigree/project_1sample.ped similarity index 100% rename from .test/configs/include_priorQC/project_1sample.ped rename to .test/configs/include_priorQC/pedigree/project_1sample.ped diff --git a/.test/configs/include_priorQC/project_2samples.ped b/.test/configs/include_priorQC/pedigree/project_2samples.ped similarity index 100% rename from .test/configs/include_priorQC/project_2samples.ped rename to .test/configs/include_priorQC/pedigree/project_2samples.ped diff --git a/.test/configs/no_priorQC/project_1sample.ped b/.test/configs/no_priorQC/pedigree/project_1sample.ped similarity index 100% rename from .test/configs/no_priorQC/project_1sample.ped rename to .test/configs/no_priorQC/pedigree/project_1sample.ped diff --git a/.test/configs/no_priorQC/project_2samples.ped b/.test/configs/no_priorQC/pedigree/project_2samples.ped similarity index 100% rename from .test/configs/no_priorQC/project_2samples.ped rename to .test/configs/no_priorQC/pedigree/project_2samples.ped From 5ddfbda56149301a46db675e14937c8aef9e6149 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 
09:28:36 -0500 Subject: [PATCH 03/51] adds a sample config file --- .test/configs/no_priorQC/sample_config/project_2samples.tsv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .test/configs/no_priorQC/sample_config/project_2samples.tsv diff --git a/.test/configs/no_priorQC/sample_config/project_2samples.tsv b/.test/configs/no_priorQC/sample_config/project_2samples.tsv new file mode 100644 index 0000000..61e979f --- /dev/null +++ b/.test/configs/no_priorQC/sample_config/project_2samples.tsv @@ -0,0 +1,3 @@ +sample_id bam vcf +C .test/ngs-data/test_project/analysis/C/bam/C.bam .test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz +D .test/ngs-data/test_project/analysis/D/bam/D.bam .test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz From 335403e290fbafda98f72940a9aa6f81d1f5b34c Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:31:30 -0500 Subject: [PATCH 04/51] adds script to read sample config --- src/read_sample_config.py | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/read_sample_config.py diff --git a/src/read_sample_config.py b/src/read_sample_config.py new file mode 100644 index 0000000..a150ec5 --- /dev/null +++ b/src/read_sample_config.py @@ -0,0 +1,45 @@ +""" +Read user sample config file, verify filepaths exist and return dictionary +""" + +from pathlib import Path +import csv + + +def get_full_path(x): + full_path = Path(x).resolve() + + return str(full_path) + + +def is_valid_file(fpath): + if not Path(fpath).is_file(): + print(f"File provided in sample config file was not found: {fpath}") + raise SystemExit(1) + + return get_full_path(fpath) + + +def read_sample_config(config_f): + + with open(config_f) as fh: + csv_reader = csv.DictReader(fh, delimiter="\t") + + samples_dict = {} + for row in csv_reader: + bam = is_valid_file(row["bam"]) + vcf = is_valid_file(row["vcf"]) + + sample = row["sample_id"].strip(" ") + if sample in samples_dict: + print(f"ERROR: Sample 
'{sample}' found >1x in config file '{config_f}'") + raise SystemExit(1) + + samples_dict[sample] = {"vcf": vcf, "bam": bam} + + return samples_dict + + +if __name__ == "__main__": + SAMPLES_CONFIG_F = ".test/configs/no_priorQC/sample_config/project_2samples.tsv" + read_sample_config(SAMPLES_CONFIG_F) \ No newline at end of file From 604af7f225549afdf6cc5dd45e556c910ea5375e Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:49:34 -0500 Subject: [PATCH 05/51] refactors to use file inputs from sample config --- src/run_quac.py | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/src/run_quac.py b/src/run_quac.py index ecf51ab..9df125e 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -19,6 +19,7 @@ import yaml from singularity_status import test_singularity from slurm.submit_slurm_job import submit_slurm_job +from read_sample_config import read_sample_config def make_dir(d): @@ -91,8 +92,7 @@ def read_workflow_config(workflow_config_fpath): def gather_mount_paths( - projects_path, - project_name, + sample_config, pedigree_path, out_dir, log_dir, @@ -105,10 +105,14 @@ def gather_mount_paths( mount_paths = set() - # project path - project_path = Path(projects_path) / project_name / "analysis" - make_dir(project_path) - mount_paths.add(project_path) + # sample_config + mount_paths.add(Path(sample_config).parent) + + # input filepaths from sample config + samples_dict = read_sample_config(sample_config) + for sample_val in samples_dict.values(): + for val_fpath in sample_val.values(): + mount_paths.add(Path(val_fpath).parent) # pedigree filepath mount_paths.add(Path(pedigree_path).parent) @@ -168,8 +172,6 @@ def create_snakemake_command(args, repo_path, mount_paths): tmp_dir = args.tmp_dir quac_configs = { - "project_name": args.project_name, - "projects_path": args.projects_path, "ped": args.pedigree, "quac_watch_config": args.quac_watch_config, "workflow_config": args.workflow_config, 
@@ -234,8 +236,7 @@ def main(args): # process user's input-output config file and get singularity bind paths mount_paths = gather_mount_paths( - args.projects_path, - args.project_name, + args.sample_config, args.pedigree, args.outdir, args.log_dir, @@ -270,7 +271,7 @@ def main(args): "resources": slurm_resources, } - submit_slurm_job(pipeline_cmd, job_dict) + # submit_slurm_job(pipeline_cmd, job_dict) return None @@ -314,19 +315,14 @@ def create_dirpath(arg): WORKFLOW = PARSER.add_argument_group("QuaC snakemake workflow options") WORKFLOW.add_argument( - "--project_name", - help="Project name. Required.", - required=True, - ) - WORKFLOW.add_argument( - "--projects_path", - help="Path where all projects are hosted. Do not include project name here. Required.", - type=lambda x: is_valid_dir(PARSER, x), + "--sample_config", + help="Sample config file in TSV format. Provides sample name and necessary input filepaths (bam, vcf, etc.)", + type=lambda x: is_valid_file(PARSER, x), required=True, ) WORKFLOW.add_argument( "--pedigree", - help="Pedigree filepath. Must correspond to the project supplied via --project_name. Required.", + help="Pedigree filepath. Must correspond to samples mentioned in configfile via --sample_config. Required.", type=lambda x: is_valid_file(PARSER, x), required=True, ) @@ -427,9 +423,5 @@ def create_dirpath(arg): raise SystemExit( "Error. Quac-watch config is missing. Please supply using --quac_watch_config." ) - if not ARGS.projects_path: - raise SystemExit( - "Error. 'Projects path' not provided. Please supply using --projects_path." 
- ) main(ARGS) From d998273f4692fbfac8ff66ffdee746987f61d67b Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:49:44 -0500 Subject: [PATCH 06/51] updates testing command --- docs/system_testing.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index eb84a59..7cb1316 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -42,9 +42,8 @@ PRIOR_QC_STATUS="no_priorQC" # WGS mode python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/no_priorQC/sample_config/project_2samples.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --workflow_config "configs/workflow.yaml" \ From f940c0f48a02081575df6d3b73a7eb9454d1bd43 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:50:02 -0500 Subject: [PATCH 07/51] uncomments --- src/run_quac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/run_quac.py b/src/run_quac.py index 9df125e..b5d1d1e 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -271,7 +271,7 @@ def main(args): "resources": slurm_resources, } - # submit_slurm_job(pipeline_cmd, job_dict) + submit_slurm_job(pipeline_cmd, job_dict) return None From a80e05daa7cce514affaa9b0ab1e6721e5066802 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 09:55:28 -0500 Subject: [PATCH 08/51] refactors workflow to get samples from sample config --- workflow/rules/common.smk | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e19b2a6..093f378 100644 --- 
a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,21 +1,29 @@ +import csv import re from pathlib import PurePath from snakemake.logging import logger -def get_samples(ped_fpath): - """ - Parse pedigree file and return sample names - """ - samples = () - with open(ped_fpath, "r") as f_handle: - for line in f_handle: - if line.startswith("#"): - continue - sample = line.split("\t")[1] - samples += (sample,) +# TODO: refactor to import from src/read_sample_config.py +def read_sample_config(config_f): + "read sample config file and return map of samples to their input filepaths" + + with open(config_f) as fh: + csv_reader = csv.DictReader(fh, delimiter="\t") + + samples_dict = {} + for row in csv_reader: + bam = is_valid_file(row["bam"]) + vcf = is_valid_file(row["vcf"]) + + sample = row["sample_id"].strip(" ") + if sample in samples_dict: + print(f"ERROR: Sample '{sample}' found >1x in config file '{config_f}'") + raise SystemExit(1) + + samples_dict[sample] = {"vcf": vcf, "bam": bam} - return samples + return samples_dict def is_testing_mode(): @@ -91,7 +99,8 @@ INCLUDE_PRIOR_QC_DATA = config["include_prior_qc_data"] RULE_LOGS_PATH = Path(config["log_dir"]) / "rule_logs" RULE_LOGS_PATH.mkdir(parents=True, exist_ok=True) -SAMPLES = get_samples(PEDIGREE_FPATH) +SAMPLES_CONFIG = read_sample_config(config["sample_config"]) +SAMPLES = list(SAMPLES_CONFIG.keys()) MULTIQC_CONFIG_FILE = OUT_DIR / "project_level_qc" / "multiqc" / "configs" / f"tmp_multiqc_config-{config['unique_id']}.yaml" logger.info(f"// Processing project: {PROJECT_NAME}") From 9e11930624bfee42325a554b04bee252979645a0 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 10:13:10 -0500 Subject: [PATCH 09/51] adds missing input --- src/run_quac.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/run_quac.py b/src/run_quac.py index b5d1d1e..5be75d3 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -172,6 +172,7 @@ def create_snakemake_command(args, repo_path, 
mount_paths): tmp_dir = args.tmp_dir quac_configs = { + "sample_config": args.sample_config, "ped": args.pedigree, "quac_watch_config": args.quac_watch_config, "workflow_config": args.workflow_config, From df5c1485683dea3340625af3617b6d08f513915b Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 10:23:53 -0500 Subject: [PATCH 10/51] refactors workflow to use input from sample configfile --- workflow/Snakefile | 2 -- workflow/rules/common.smk | 11 ++++---- workflow/rules/coverage_analysis.smk | 28 ++++++++----------- workflow/rules/relatedness_ancestry.smk | 4 +-- workflow/rules/vcf_stats.smk | 4 +-- .../rules/within_species_contamintation.smk | 4 +-- 6 files changed, 22 insertions(+), 31 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 82e0321..861205e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,6 +1,5 @@ """ QuaC Pipeline to perform QC on bams and vcfs. -Works at project level. """ WORKFLOW_PATH = Path(workflow.basedir).parent @@ -24,7 +23,6 @@ include: "rules/aggregate_results.smk" ############ CONSTRAINTS ############ wildcard_constraints: sample="|".join(SAMPLES), - project=PROJECT_NAME, ######################################### diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 093f378..8d514cb 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -13,8 +13,8 @@ def read_sample_config(config_f): samples_dict = {} for row in csv_reader: - bam = is_valid_file(row["bam"]) - vcf = is_valid_file(row["vcf"]) + bam = row["bam"] + vcf = row["vcf"] sample = row["sample_id"].strip(" ") if sample in samples_dict: @@ -88,8 +88,6 @@ def get_small_var_pipeline_targets(wildcards): ########################## Configs from CLI ########################## OUT_DIR = Path(config["out_dir"]) -PROJECT_NAME = config["project_name"] -PROJECT_PATH = Path(config["projects_path"]) / PROJECT_NAME / "analysis" PEDIGREE_FPATH = config["ped"] EXOME_MODE = config["exome"] 
ALLOW_SAMPLE_RENAMING = config["allow_sample_renaming"] @@ -100,10 +98,11 @@ RULE_LOGS_PATH = Path(config["log_dir"]) / "rule_logs" RULE_LOGS_PATH.mkdir(parents=True, exist_ok=True) SAMPLES_CONFIG = read_sample_config(config["sample_config"]) +print ([value["bam"] for value in SAMPLES_CONFIG.values()]) SAMPLES = list(SAMPLES_CONFIG.keys()) MULTIQC_CONFIG_FILE = OUT_DIR / "project_level_qc" / "multiqc" / "configs" / f"tmp_multiqc_config-{config['unique_id']}.yaml" -logger.info(f"// Processing project: {PROJECT_NAME}") -logger.info(f'// Project path: "{PROJECT_PATH}"') +logger.info(f"// Sample configfile: {config['sample_config']}") +logger.info(f'// Output directory: "{OUT_DIR}"') logger.info(f"// Exome mode: {EXOME_MODE}") logger.info(f"// Include prior QC data: {INCLUDE_PRIOR_QC_DATA}") diff --git a/workflow/rules/coverage_analysis.smk b/workflow/rules/coverage_analysis.smk index 4c9b2d8..5865ebb 100644 --- a/workflow/rules/coverage_analysis.smk +++ b/workflow/rules/coverage_analysis.smk @@ -1,7 +1,7 @@ ########################## Samtools ########################## rule samtools_stats: input: - PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", + lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] output: protected(OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt"), singularity: @@ -19,8 +19,8 @@ rule samtools_stats: ########################## Qualimap ########################## rule qualimap_bamqc: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", target_regions=get_capture_regions_bed if EXOME_MODE else [], output: html_report=protected(OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html"), @@ -53,8 +53,8 @@ rule qualimap_bamqc: ########################## Picard ########################## rule 
picard_collect_multiple_metrics: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", ref=config["datasets"]["ref"], output: multiext( @@ -82,8 +82,8 @@ rule picard_collect_multiple_metrics: rule picard_collect_wgs_metrics: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", ref=config["datasets"]["ref"], output: OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}.collect_wgs_metrics", @@ -103,8 +103,8 @@ rule picard_collect_wgs_metrics: ########################## Mosdepth ########################## rule mosdepth_coverage: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", target_regions=get_capture_regions_bed if EXOME_MODE else [], output: dist=protected(OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.global.dist.txt"), @@ -153,14 +153,8 @@ rule mosdepth_plot: ########################## indexcov ########################## rule indexcov: input: - bam=expand( - PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - sample=SAMPLES, - ), - bam_index=expand( - PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", - sample=SAMPLES, - ), + bam=[value["bam"] for value in SAMPLES_CONFIG.values()], + bam_index=[value["bam"]+".bai" for value in SAMPLES_CONFIG.values()], output: html=protected(OUT_DIR / "project_level_qc" / "indexcov" / "index.html"), bed=protected(OUT_DIR / "project_level_qc" / 
"indexcov" / "indexcov-indexcov.bed.gz"), diff --git a/workflow/rules/relatedness_ancestry.smk b/workflow/rules/relatedness_ancestry.smk index a1db793..a8b27d2 100644 --- a/workflow/rules/relatedness_ancestry.smk +++ b/workflow/rules/relatedness_ancestry.smk @@ -1,7 +1,7 @@ rule somalier_extract: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", sites=config["datasets"]["somalier"]["sites"], ref_genome=config["datasets"]["ref"], output: diff --git a/workflow/rules/vcf_stats.smk b/workflow/rules/vcf_stats.smk index 300b936..9d036fd 100644 --- a/workflow/rules/vcf_stats.smk +++ b/workflow/rules/vcf_stats.smk @@ -1,6 +1,6 @@ rule bcftools_stats: input: - PROJECT_PATH / "{sample}" / "vcf" / "{sample}.vcf.gz", + lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["vcf"], output: protected(OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats"), message: @@ -17,7 +17,7 @@ rule bcftools_stats: rule bcftools_index: input: - PROJECT_PATH / "{sample}" / "vcf" / "{sample}.vcf.gz", + lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["vcf"], output: protected(OUT_DIR / "{sample}" / "qc" / "bcftools-index" / "{sample}.bcftools.index.tsv"), message: diff --git a/workflow/rules/within_species_contamintation.smk b/workflow/rules/within_species_contamintation.smk index a23dcbc..3877c10 100644 --- a/workflow/rules/within_species_contamintation.smk +++ b/workflow/rules/within_species_contamintation.smk @@ -7,8 +7,8 @@ def get_svd(wildcards): rule verifybamid: input: - bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam", - bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai", + bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], + bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", 
ref_genome=config["datasets"]["ref"], svd=get_svd, output: From aa3dfa0993bec19f1d2542ba0b9be1613ee525fb Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 10:31:52 -0500 Subject: [PATCH 11/51] refactors testing mode identification --- workflow/rules/common.smk | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 8d514cb..99916f0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -30,9 +30,11 @@ def is_testing_mode(): "checks if testing dataset is used as input for the pipeline" query = ".test" - if query in PurePath(PROJECT_PATH).parts: - logger.info(f"// WARNING: '{query}' present in the path supplied via --projects_path. So testing mode is used.") - return True + for sample in SAMPLES_CONFIG.values(): + for fpath in sample.values(): + if query in PurePath(fpath).parts: + logger.info(f"// WARNING: '{query}' present in at least one of the filepaths supplied via --sample_config. So testing mode is used.") + return True return None From d5a8d5efb9cb1f77fa9b4ed0c9056758dfb29143 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 11:32:24 -0500 Subject: [PATCH 12/51] updates doc on cli help --- docs/quac_cli.md | 21 ++++++++++----------- src/run_quac.py | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/quac_cli.md b/docs/quac_cli.md index 8bbf810..459e132 100644 --- a/docs/quac_cli.md +++ b/docs/quac_cli.md @@ -6,10 +6,9 @@ wrapper/CLI (command line interface) tool `src/run_quac.py`. 
## Command line interface ```sh -$ python src/run_quac.py -h -usage: run_quac.py [-h] --project_name PROJECT_NAME --projects_path - PROJECTS_PATH --pedigree PEDIGREE --quac_watch_config - QUAC_WATCH_CONFIG [--workflow_config] +$ python src/run_quac.py -h +usage: run_quac.py [-h] --sample_config SAMPLE_CONFIG --pedigree PEDIGREE + --quac_watch_config QUAC_WATCH_CONFIG [--workflow_config] [--snakemake_cluster_config] [--outdir] [--tmp_dir] [--exome] [--include_prior_qc] [--allow_sample_renaming] [-e] [-n] [--cli_cluster_config] [--log_dir] @@ -20,13 +19,13 @@ optional arguments: -h, --help show this help message and exit QuaC snakemake workflow options: - --project_name PROJECT_NAME - Project name. Required. (default: None) - --projects_path PROJECTS_PATH - Path where all projects are hosted. Do not include - project name here. Required. (default: None) - --pedigree PEDIGREE Pedigree filepath. Must correspond to the project - supplied via --project_name. Required. (default: None) + --sample_config SAMPLE_CONFIG + Sample config file in TSV format. Provides sample name + and necessary input filepaths (bam, vcf, etc.). + Required. (default: None) + --pedigree PEDIGREE Pedigree filepath. Must correspond to samples + mentioned in configfile via --sample_config. Required. + (default: None) --quac_watch_config QUAC_WATCH_CONFIG YAML config path specifying QC thresholds for QuaC- Watch. See directory 'configs/quac_watch/' in quac diff --git a/src/run_quac.py b/src/run_quac.py index 5be75d3..ac7c85e 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -317,7 +317,7 @@ def create_dirpath(arg): WORKFLOW.add_argument( "--sample_config", - help="Sample config file in TSV format. Provides sample name and necessary input filepaths (bam, vcf, etc.)", + help="Sample config file in TSV format. Provides sample name and necessary input filepaths (bam, vcf, etc.). 
Required.", type=lambda x: is_valid_file(PARSER, x), required=True, ) From 6bef22da873057edf15f69d958aceba72d68de28 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 11:33:19 -0500 Subject: [PATCH 13/51] turns off a rule related to --allow_sample_renaming as it causes error --- workflow/rules/aggregate_results.smk | 46 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index ec5c96f..b7bcc0f 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -164,30 +164,30 @@ rule multiqc_by_sample_final_pass: ########################## Multi-sample QC aggregation ########################## -rule aggregate_sample_rename_configs: - input: - expand( - PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", - sample=SAMPLES, - ), - output: - outfile=protected(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv"), - tempfile=temp(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "flist.txt"), - message: - "Aggregate all sample rename-config files." - singularity: - "docker://quay.io/biocontainers/mulled-v2-78a02249d8cc4e85718933e89cf41d0e6686ac25:70df245247aac9844ee84a9da1e96322a24c1f34-0" - shell: - r""" - # save files in a tempfile - echo {input} \ - | tr " " "\n" \ - > {output.tempfile} +# rule aggregate_sample_rename_configs: +# input: +# expand( +# PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", +# sample=SAMPLES, +# ), +# output: +# outfile=protected(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv"), +# tempfile=temp(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "flist.txt"), +# message: +# "Aggregate all sample rename-config files." 
+# singularity: +# "docker://quay.io/biocontainers/mulled-v2-78a02249d8cc4e85718933e89cf41d0e6686ac25:70df245247aac9844ee84a9da1e96322a24c1f34-0" +# shell: +# r""" +# # save files in a tempfile +# echo {input} \ +# | tr " " "\n" \ +# > {output.tempfile} - python src/aggregate_sample_rename_configs.py \ - --infile {output.tempfile} \ - --outfile {output.outfile} - """ +# python src/aggregate_sample_rename_configs.py \ +# --infile {output.tempfile} \ +# --outfile {output.outfile} +# """ rule multiqc_aggregation_all_samples: From 11eb239ae21ace8816a6120d64653638cb65df66 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 11:47:58 -0500 Subject: [PATCH 14/51] updates changelog --- docs/Changelog.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index de893a6..1e66314 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -12,6 +12,18 @@ YYYY-MM-DD John Doe ``` --- +2023-10-05 Manavalan Gajapathy + +* Refactors to accept sample filepaths via user-provided sample config file. Only for WGS mode in minimal manner (w/o + --include_prior_qc, --allow_sample_renaming) (#86) +* Adds sample config file to use with system testing datasets - + `.test/configs/no_priorQC/sample_config/project_2samples.tsv`. This provides map of sample name to their VCF and BAM + filepaths. +* Refactors use of `--sample_config` arg to work with this config file as input +* Deprecates args `--project_name` and `--projects_path` +* Modifies workflow to use the new input setup +* Updates README concerning the changes made + 2023-07-17 Manavalan Gajapathy * Minor updates to documentation. 
From 79763706c30051de65ac9d58eb8ce07946840c67 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 12:48:48 -0500 Subject: [PATCH 15/51] adds test exome sample config --- .../no_priorQC/sample_config/project_2samples_exome.tsv | 3 +++ .../{project_2samples.tsv => project_2samples_wgs.tsv} | 0 2 files changed, 3 insertions(+) create mode 100644 .test/configs/no_priorQC/sample_config/project_2samples_exome.tsv rename .test/configs/no_priorQC/sample_config/{project_2samples.tsv => project_2samples_wgs.tsv} (100%) diff --git a/.test/configs/no_priorQC/sample_config/project_2samples_exome.tsv b/.test/configs/no_priorQC/sample_config/project_2samples_exome.tsv new file mode 100644 index 0000000..396661e --- /dev/null +++ b/.test/configs/no_priorQC/sample_config/project_2samples_exome.tsv @@ -0,0 +1,3 @@ +sample_id bam vcf capture_bed +C .test/ngs-data/test_project/analysis/C/bam/C.bam .test/ngs-data/test_project/analysis/C/vcf/C.vcf.gz .test/ngs-data/test_project/analysis/C/configs/small_variant_caller/capture_regions.bed +D .test/ngs-data/test_project/analysis/D/bam/D.bam .test/ngs-data/test_project/analysis/D/vcf/D.vcf.gz .test/ngs-data/test_project/analysis/D/configs/small_variant_caller/capture_regions.bed diff --git a/.test/configs/no_priorQC/sample_config/project_2samples.tsv b/.test/configs/no_priorQC/sample_config/project_2samples_wgs.tsv similarity index 100% rename from .test/configs/no_priorQC/sample_config/project_2samples.tsv rename to .test/configs/no_priorQC/sample_config/project_2samples_wgs.tsv From bfcd9b35a3413a5626a35dc6194d8c30665af19d Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 12:49:05 -0500 Subject: [PATCH 16/51] updates commands in doc --- docs/system_testing.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index 7cb1316..841731e 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -42,7 +42,7 @@ 
PRIOR_QC_STATUS="no_priorQC" # WGS mode python src/run_quac.py \ - --sample_config ".test/configs/no_priorQC/sample_config/project_2samples.tsv" \ + --sample_config ".test/configs/no_priorQC/sample_config/project_2samples_wgs.tsv" \ --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ @@ -51,9 +51,8 @@ python src/run_quac.py \ # Exome mode python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/no_priorQC/sample_config/project_2samples_exome.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --workflow_config "configs/workflow.yaml" \ From 27322594f0823f988c578fc52d609a16bf3a17f6 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 12:54:22 -0500 Subject: [PATCH 17/51] reads capture bed column --- src/read_sample_config.py | 12 +++++++++++- workflow/rules/common.smk | 10 +++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/read_sample_config.py b/src/read_sample_config.py index a150ec5..3cba078 100644 --- a/src/read_sample_config.py +++ b/src/read_sample_config.py @@ -14,7 +14,7 @@ def get_full_path(x): def is_valid_file(fpath): if not Path(fpath).is_file(): - print(f"File provided in sample config file was not found: {fpath}") + print(f"ERROR: File provided in sample config file was not found: {fpath}") raise SystemExit(1) return get_full_path(fpath) @@ -37,6 +37,16 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": vcf, "bam": bam} + for colname in ["capture_bed"]: + if colname in row: + 
samples_dict[sample][colname] = is_valid_file(row[colname]) + + if colname == "capture_bed" and not row["capture_bed"].endswith(".bed"): + print( + f"ERROR: Capture bed filename is required to end with '.bed' extension but it did not: '{row['capture_bed']}'" + ) + raise SystemExit(1) + return samples_dict diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 99916f0..7b5eec2 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -23,6 +23,10 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": vcf, "bam": bam} + for colname in ["capture_bed"]: + if colname in row: + samples_dict[sample][colname] = row[colname] + return samples_dict @@ -95,13 +99,13 @@ EXOME_MODE = config["exome"] ALLOW_SAMPLE_RENAMING = config["allow_sample_renaming"] INCLUDE_PRIOR_QC_DATA = config["include_prior_qc_data"] +SAMPLES_CONFIG = read_sample_config(config["sample_config"]) +SAMPLES = list(SAMPLES_CONFIG.keys()) + #### configs from configfile #### RULE_LOGS_PATH = Path(config["log_dir"]) / "rule_logs" RULE_LOGS_PATH.mkdir(parents=True, exist_ok=True) -SAMPLES_CONFIG = read_sample_config(config["sample_config"]) -print ([value["bam"] for value in SAMPLES_CONFIG.values()]) -SAMPLES = list(SAMPLES_CONFIG.keys()) MULTIQC_CONFIG_FILE = OUT_DIR / "project_level_qc" / "multiqc" / "configs" / f"tmp_multiqc_config-{config['unique_id']}.yaml" logger.info(f"// Sample configfile: {config['sample_config']}") From f0bb7ddd38bd14b63914c387efc8b1a6e6da33ba Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 13:05:13 -0500 Subject: [PATCH 18/51] refactors getting capture bedfile --- workflow/rules/common.smk | 21 --------------------- workflow/rules/coverage_analysis.smk | 4 ++-- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7b5eec2..0f1d4cf 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -43,27 +43,6 @@ def is_testing_mode(): 
return None -def get_capture_regions_bed(wildcards): - "returns capture bed file (if any) used by small variant caller pipeline" - - config_dir = PROJECT_PATH / wildcards.sample / "configs" / "small_variant_caller" - compressed_bed = list(config_dir.glob("*.bed.gz")) - if compressed_bed: - logger.error( - f"ERROR: Compressed capture bed file found for sample {wildcards.sample}, " - "but it is not supported by some QC tools used in QuaC (eg. qualimap). " - f"Use compressed bed file instead. - {compressed_bed}" - ) - raise SystemExit(1) - - bed = list(config_dir.glob("*.bed")) - if len(bed) != 1: - logger.error(f"ERROR: No or >1 capture bed file found for sample {wildcards.sample} - {bed}") - raise SystemExit(1) - - return bed - - def get_small_var_pipeline_targets(wildcards): """ Returns target files that are output by small variant caller pipeline. diff --git a/workflow/rules/coverage_analysis.smk b/workflow/rules/coverage_analysis.smk index 5865ebb..e465087 100644 --- a/workflow/rules/coverage_analysis.smk +++ b/workflow/rules/coverage_analysis.smk @@ -21,7 +21,7 @@ rule qualimap_bamqc: input: bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", - target_regions=get_capture_regions_bed if EXOME_MODE else [], + target_regions=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["capture_bed"] if EXOME_MODE else [], output: html_report=protected(OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html"), coverage=protected(OUT_DIR / "{sample}" / "qc/qualimap/{sample}/raw_data_qualimapReport/coverage_across_reference.txt"), @@ -105,7 +105,7 @@ rule mosdepth_coverage: input: bam=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"], bam_index=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["bam"] + ".bai", - target_regions=get_capture_regions_bed if EXOME_MODE else [], + target_regions=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["capture_bed"] if 
EXOME_MODE else [], output: dist=protected(OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.global.dist.txt"), summary=protected(OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.summary.txt"), From effa2762c245c73c28c6ea12557d7f56c4e0383c Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 13:15:21 -0500 Subject: [PATCH 19/51] updates changelog --- docs/Changelog.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index 1e66314..352ff01 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -14,6 +14,13 @@ YYYY-MM-DD John Doe 2023-10-05 Manavalan Gajapathy +* Refactors to accept sample filepaths via user-provided sample config file. Only for exome mode in minimal manner (w/o + --include_prior_qc, --allow_sample_renaming) (#86) +* Adds a test sample config file +* Refactors to get capture bed file as input from the sample configfile + +2023-10-05 Manavalan Gajapathy + * Refactors to accept sample filepaths via user-provided sample config file. 
Only for WGS mode in minimal manner (w/o --include_prior_qc, --allow_sample_renaming) (#86) * Adds sample config file to use with system testing datasets - From 7bad7906a2fa1f12270d2407f7ecf70fdbe5f39f Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 14:40:24 -0500 Subject: [PATCH 20/51] returns header; removes file suffix verification --- src/read_sample_config.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/read_sample_config.py b/src/read_sample_config.py index 3cba078..dfc9ed4 100644 --- a/src/read_sample_config.py +++ b/src/read_sample_config.py @@ -14,7 +14,7 @@ def get_full_path(x): def is_valid_file(fpath): if not Path(fpath).is_file(): - print(f"ERROR: File provided in sample config file was not found: {fpath}") + print(f"ERROR: File provided in sample config file was not found: '{fpath}'") raise SystemExit(1) return get_full_path(fpath) @@ -24,6 +24,7 @@ def read_sample_config(config_f): with open(config_f) as fh: csv_reader = csv.DictReader(fh, delimiter="\t") + colnames = csv_reader.fieldnames samples_dict = {} for row in csv_reader: @@ -41,13 +42,7 @@ def read_sample_config(config_f): if colname in row: samples_dict[sample][colname] = is_valid_file(row[colname]) - if colname == "capture_bed" and not row["capture_bed"].endswith(".bed"): - print( - f"ERROR: Capture bed filename is required to end with '.bed' extension but it did not: '{row['capture_bed']}'" - ) - raise SystemExit(1) - - return samples_dict + return samples_dict, colnames if __name__ == "__main__": From 96fa05b194de98e9bc2a60c05d13fe7745bed926 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 14:43:07 -0500 Subject: [PATCH 21/51] reads sample configfile and verifies columns exist --- src/run_quac.py | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/run_quac.py b/src/run_quac.py index ac7c85e..929ef73 100755 --- a/src/run_quac.py +++ 
b/src/run_quac.py @@ -40,6 +40,38 @@ def get_tool_path(tool): return tool_path +def check_sample_configs(fpath, exome_mode, include_prior_qc, allow_sample_renaming): + """ + reads from sample configfile and verifies necessary column names exist + """ + + samples_dict, header = read_sample_config(fpath) + + if exome_mode: + if "capture_bed" not in header: + print( + f"ERROR: Flag --exome supplied but required column 'capture_bed' is missing in sample configfile '{fpath}'" + ) + raise SystemExit(1) + + for sample in samples_dict: + if not samples_dict[sample]["capture_bed"].endswith(".bed"): + print( + f"ERROR: Capture bed filename is required to end with '.bed' extension: '{samples_dict[sample]['capture_bed']}'" + ) + raise SystemExit(1) + + # TODO + if include_prior_qc and "TODO" not in header: + pass + + # TODO + if allow_sample_renaming and "TODO" not in header: + pass + + return samples_dict + + def check_mount_paths_exist(paths): """ Verify the paths to be mounted to Singularity exist @@ -92,7 +124,8 @@ def read_workflow_config(workflow_config_fpath): def gather_mount_paths( - sample_config, + sample_config_f, + samples_config_dict, pedigree_path, out_dir, log_dir, @@ -106,11 +139,10 @@ def gather_mount_paths( mount_paths = set() # sample_config - mount_paths.add(Path(sample_config).parent) + mount_paths.add(Path(sample_config_f).parent) # input filepaths from sample config - samples_dict = read_sample_config(sample_config) - for sample_val in samples_dict.values(): + for sample_val in samples_config_dict.values(): for val_fpath in sample_val.values(): mount_paths.add(Path(val_fpath).parent) @@ -235,9 +267,15 @@ def main(args): # check singularity works properly in user's machine test_singularity() + # read sample configfile and verify necessary columns exist + samples_config_dict = check_sample_configs( + args.sample_config, args.exome, args.include_prior_qc, args.allow_sample_renaming + ) + # process user's input-output config file and get singularity bind 
paths mount_paths = gather_mount_paths( args.sample_config, + samples_config_dict, args.pedigree, args.outdir, args.log_dir, From 05c37e21df5094d22ba98f435eaae39b5ca55a98 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 15:21:04 -0500 Subject: [PATCH 22/51] adds test priorqc sample configfile --- .../include_priorQC/sample_config/project_2samples_wgs.tsv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv diff --git a/.test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv b/.test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv new file mode 100644 index 0000000..64636a8 --- /dev/null +++ b/.test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv @@ -0,0 +1,3 @@ +sample_id bam vcf fastqc_raw fastqc_trimmed fastq_screen dedup multiqc_rename_config +A .test/ngs-data/test_project/analysis/A/bam/A.bam .test/ngs-data/test_project/analysis/A/vcf/A.vcf.gz .test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-1-R1_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-1-R2_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-2-R1_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-2-R2_screen.txt 
.test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt,.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt .test/ngs-data/test_project/analysis/A/qc/multiqc_initial_pass/multiqc_sample_rename_config/A_rename_config.tsv +B .test/ngs-data/test_project/analysis/B/bam/B.bam .test/ngs-data/test_project/analysis/B/vcf/B.vcf.gz .test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-1-R1_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-1-R2_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-2-R1_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-2-R2_screen.txt .test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt,.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt .test/ngs-data/test_project/analysis/B/qc/multiqc_initial_pass/multiqc_sample_rename_config/B_rename_config.tsv From 3cb821f6e55d9df01c440722c78da3a05a57d9ae Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:08:51 -0500 Subject: [PATCH 23/51] reads priorqc specific columns --- src/read_sample_config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/read_sample_config.py b/src/read_sample_config.py index dfc9ed4..5b1bc3b 100644 --- a/src/read_sample_config.py +++ b/src/read_sample_config.py @@ -38,10 +38,16 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": 
vcf, "bam": bam} + # expect only filepath per field for colname in ["capture_bed"]: if colname in row: samples_dict[sample][colname] = is_valid_file(row[colname]) + # expect >=1 filepath per field + for colname in ["fastqc_raw", "fastqc_trimmed", "fastq_screen", "dedup"]: + if colname in row: + samples_dict[sample][colname] = [is_valid_file(f) for f in row[colname].split(",")] + return samples_dict, colnames From 282fb5137e3721f38763b573b71360b689365a2f Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:09:25 -0500 Subject: [PATCH 24/51] fixes mounting paths --- src/run_quac.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/run_quac.py b/src/run_quac.py index 929ef73..9910b10 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -144,7 +144,11 @@ def gather_mount_paths( # input filepaths from sample config for sample_val in samples_config_dict.values(): for val_fpath in sample_val.values(): - mount_paths.add(Path(val_fpath).parent) + if isinstance(val_fpath, str): + mount_paths.add(Path(val_fpath).parent) + elif isinstance(val_fpath, list): + for item in val_fpath: + mount_paths.add(Path(item).parent) # pedigree filepath mount_paths.add(Path(pedigree_path).parent) From b37fecede8a9d238deabec8af8e779b2b5692fa4 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:09:44 -0500 Subject: [PATCH 25/51] updates command in doc --- docs/system_testing.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index 841731e..2dde5ae 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -66,16 +66,16 @@ PRIOR_QC_STATUS="include_priorQC" # WGS mode python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv" \ + --pedigree 
".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --include_prior_qc \ - --allow_sample_renaming \ --workflow_config "configs/workflow.yaml" \ $USE_SLURM + # --allow_sample_renaming \ + # Exome mode python src/run_quac.py \ --project_name test_project \ From ec604c793810f5b4b5cf8e72cb4c98bf437fd58a Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:13:48 -0500 Subject: [PATCH 26/51] refactors workflow --- workflow/rules/aggregate_results.smk | 16 +++-------- workflow/rules/common.smk | 40 ++++++++++++---------------- 2 files changed, 20 insertions(+), 36 deletions(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index b7bcc0f..71a32c0 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -25,7 +25,7 @@ rule create_multiqc_config: ########################## Single-sample-level QC aggregation ########################## rule multiqc_by_sample_initial_pass: input: - get_small_var_pipeline_targets if INCLUDE_PRIOR_QC_DATA else [], + lambda wildcards: get_priorQC_filepaths(wildcards.sample, SAMPLES_CONFIG) if INCLUDE_PRIOR_QC_DATA else [], OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt", OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html", OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}.alignment_summary_metrics", @@ -126,7 +126,7 @@ rule quac_watch: rule multiqc_by_sample_final_pass: input: - get_small_var_pipeline_targets if INCLUDE_PRIOR_QC_DATA else [], + lambda wildcards: get_priorQC_filepaths(wildcards.sample, SAMPLES_CONFIG) if INCLUDE_PRIOR_QC_DATA else [], OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt", OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html", OUT_DIR / "{sample}" / "qc" / 
"picard-stats" / "{sample}.alignment_summary_metrics", @@ -192,17 +192,7 @@ rule multiqc_by_sample_final_pass: rule multiqc_aggregation_all_samples: input: - expand( - [ - PROJECT_PATH / "{sample}" / "qc" / "fastqc-raw" / "{sample}-{unit}-{read}_fastqc.zip", - PROJECT_PATH / "{sample}" / "qc" / "fastqc-trimmed" / "{sample}-{unit}-{read}_fastqc.zip", - PROJECT_PATH / "{sample}" / "qc" / "fastq_screen-trimmed" / "{sample}-{unit}-{read}_screen.txt", - PROJECT_PATH / "{sample}" / "qc" / "dedup" / "{sample}-{unit}.metrics.txt", - ], - sample=SAMPLES, - unit=[1], - read=["R1", "R2"], - ) if INCLUDE_PRIOR_QC_DATA else [], + [get_priorQC_filepaths(sample, SAMPLES_CONFIG) for sample in SAMPLES_CONFIG.keys()] if INCLUDE_PRIOR_QC_DATA else [], expand( [ OUT_DIR / "project_level_qc" / "somalier" / "relatedness" / "somalier.html", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0f1d4cf..ab54045 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -23,9 +23,15 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": vcf, "bam": bam} + # expect only filepath per field for colname in ["capture_bed"]: if colname in row: samples_dict[sample][colname] = row[colname] + + # expect >=1 filepath per field + for colname in ["fastqc_raw", "fastqc_trimmed", "fastq_screen", "dedup"]: + if colname in row: + samples_dict[sample][colname] = row[colname].split(",") return samples_dict @@ -43,32 +49,20 @@ def is_testing_mode(): return None -def get_small_var_pipeline_targets(wildcards): + +def get_priorQC_filepaths(sample, samples_dict): """ - Returns target files that are output by small variant caller pipeline. 
- Uses deduplication's output files as proxy to identify "units" + Returns filepaths relevant to priorQC """ - flist = (PROJECT_PATH / wildcards.sample / "qc" / "dedup").glob("*.metrics.txt") - units = [] - for fpath in flist: - unit = re.match(fr"{wildcards.sample}-(\d+).metrics.txt", fpath.name) - units.append(unit.group(1)) - - targets = ( - expand( - [ - PROJECT_PATH / "{{sample}}" / "qc" / "fastqc-raw" / "{{sample}}-{unit}-{read}_fastqc.zip", - PROJECT_PATH / "{{sample}}" / "qc" / "fastqc-trimmed" / "{{sample}}-{unit}-{read}_fastqc.zip", - PROJECT_PATH / "{{sample}}" / "qc" / "fastq_screen-trimmed" / "{{sample}}-{unit}-{read}_screen.txt", - PROJECT_PATH / "{{sample}}" / "qc" / "dedup" / "{{sample}}-{unit}.metrics.txt", - ], - unit=units, - read=["R1", "R2"], - ), - ) - - return targets[0] + column_list = ["fastqc_raw", "fastqc_trimmed", "fastq_screen", "dedup"] + file_list = [] + for column in column_list: + file_list.append(samples_dict[sample][column]) + + flat_filelist = [item for sublist in file_list for item in sublist] + + return flat_filelist ########################## Configs from CLI ########################## From c5f882f0ea2f0c7bb1ef019a92721b1f19fa3ec8 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:20:35 -0500 Subject: [PATCH 27/51] adds sample config for exome mode --- .../sample_config/project_2samples_exome.tsv | 3 +++ docs/system_testing.md | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 .test/configs/include_priorQC/sample_config/project_2samples_exome.tsv diff --git a/.test/configs/include_priorQC/sample_config/project_2samples_exome.tsv b/.test/configs/include_priorQC/sample_config/project_2samples_exome.tsv new file mode 100644 index 0000000..f9d8592 --- /dev/null +++ b/.test/configs/include_priorQC/sample_config/project_2samples_exome.tsv @@ -0,0 +1,3 @@ +sample_id bam vcf capture_bed fastqc_raw fastqc_trimmed fastq_screen dedup multiqc_rename_config +A 
.test/ngs-data/test_project/analysis/A/bam/A.bam .test/ngs-data/test_project/analysis/A/vcf/A.vcf.gz .test/ngs-data/test_project/analysis/A/configs/small_variant_caller/capture_regions.bed .test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-raw/A-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/A/qc/fastqc-trimmed/A-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-1-R1_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-1-R2_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-2-R1_screen.txt,.test/ngs-data/test_project/analysis/A/qc/fastq_screen-trimmed/A-2-R2_screen.txt .test/ngs-data/test_project/analysis/A/qc/dedup/A-1.metrics.txt,.test/ngs-data/test_project/analysis/A/qc/dedup/A-2.metrics.txt .test/ngs-data/test_project/analysis/A/qc/multiqc_initial_pass/multiqc_sample_rename_config/A_rename_config.tsv +B .test/ngs-data/test_project/analysis/B/bam/B.bam .test/ngs-data/test_project/analysis/B/vcf/B.vcf.gz .test/ngs-data/test_project/analysis/B/configs/small_variant_caller/capture_regions.bed .test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-raw/B-2-R2_fastqc.zip 
.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-1-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-1-R2_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-2-R1_fastqc.zip,.test/ngs-data/test_project/analysis/B/qc/fastqc-trimmed/B-2-R2_fastqc.zip .test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-1-R1_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-1-R2_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-2-R1_screen.txt,.test/ngs-data/test_project/analysis/B/qc/fastq_screen-trimmed/B-2-R2_screen.txt .test/ngs-data/test_project/analysis/B/qc/dedup/B-1.metrics.txt,.test/ngs-data/test_project/analysis/B/qc/dedup/B-2.metrics.txt .test/ngs-data/test_project/analysis/B/qc/multiqc_initial_pass/multiqc_sample_rename_config/B_rename_config.tsv diff --git a/docs/system_testing.md b/docs/system_testing.md index 2dde5ae..aebf994 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -73,21 +73,19 @@ python src/run_quac.py \ --include_prior_qc \ --workflow_config "configs/workflow.yaml" \ $USE_SLURM - # --allow_sample_renaming \ # Exome mode python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/include_priorQC/sample_config/project_2samples_exome.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --exome \ --include_prior_qc \ - --allow_sample_renaming \ --workflow_config "configs/workflow.yaml" \ $USE_SLURM + # --allow_sample_renaming \ ``` !!! 
note From 72e999f99b180625b0329d84068630a511026e2c Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 18:26:39 -0500 Subject: [PATCH 28/51] updates changelog --- docs/Changelog.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index 352ff01..1141c6c 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -14,6 +14,11 @@ YYYY-MM-DD John Doe 2023-10-05 Manavalan Gajapathy +* Refactors to accept sample filepaths via user-provided sample config file, when `--include_prior_qc` is used (#86) +* Adds a test sample config file that includes priorQC filepaths + +2023-10-05 Manavalan Gajapathy + * Refactors to accept sample filepaths via user-provided sample config file. Only for exome mode in minimal manner (w/o --include_prior_qc, --allow_sample_renaming) (#86) * Adds a test sample config file From 9ebb14e68ef19b3321b5f1b794c73fdff510e3f6 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:23:32 -0500 Subject: [PATCH 29/51] ensures expected columns are present --- src/run_quac.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/run_quac.py b/src/run_quac.py index 9910b10..d044bcf 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -61,9 +61,14 @@ def check_sample_configs(fpath, exome_mode, include_prior_qc, allow_sample_renam ) raise SystemExit(1) - # TODO - if include_prior_qc and "TODO" not in header: - pass + if include_prior_qc: + columns = ["fastqc_raw", "fastqc_trimmed", "fastq_screen", "dedup"] + missing_columns = list(set(columns).difference(set(header))) + if len(missing_columns): + print( + f"ERROR: Columns missing in sample config file but needed when flag --include_prior_qc is used: {missing_columns}" + ) + raise SystemExit(1) # TODO if allow_sample_renaming and "TODO" not in header: @@ -314,7 +319,7 @@ def main(args): "resources": slurm_resources, } - submit_slurm_job(pipeline_cmd, job_dict) + # submit_slurm_job(pipeline_cmd, 
job_dict) return None From 5536441b4b5bd1b6bf806c856c0aae484924ccb9 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:26:08 -0500 Subject: [PATCH 30/51] uncomments --- src/run_quac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/run_quac.py b/src/run_quac.py index d044bcf..1d8649b 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -319,7 +319,7 @@ def main(args): "resources": slurm_resources, } - # submit_slurm_job(pipeline_cmd, job_dict) + submit_slurm_job(pipeline_cmd, job_dict) return None From 563d166cb166bac9f3eeef6f7b3fb464360e8b09 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:25:28 -0500 Subject: [PATCH 31/51] updates command in doc --- docs/system_testing.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index aebf994..1f7c1a1 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -71,9 +71,9 @@ python src/run_quac.py \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --include_prior_qc \ + --allow_sample_renaming \ --workflow_config "configs/workflow.yaml" \ $USE_SLURM - # --allow_sample_renaming \ # Exome mode python src/run_quac.py \ @@ -83,9 +83,9 @@ python src/run_quac.py \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --exome \ --include_prior_qc \ + --allow_sample_renaming \ --workflow_config "configs/workflow.yaml" \ $USE_SLURM - # --allow_sample_renaming \ ``` !!! 
note From 4f5668f6f232a5992ccbe04f8344090a3c902bc7 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:29:40 -0500 Subject: [PATCH 32/51] checks column in present --- src/run_quac.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/run_quac.py b/src/run_quac.py index 1d8649b..3ee04f7 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -70,9 +70,11 @@ def check_sample_configs(fpath, exome_mode, include_prior_qc, allow_sample_renam ) raise SystemExit(1) - # TODO - if allow_sample_renaming and "TODO" not in header: - pass + if allow_sample_renaming and "multiqc_rename_config" not in header: + print( + f"ERROR: Flag --allow_sample_renaming supplied but required column 'multiqc_rename_config' is missing in sample configfile '{fpath}'" + ) + raise SystemExit(1) return samples_dict From f3498616350166ac4dbec05ebbf35ee4e8a00b13 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:37:34 -0500 Subject: [PATCH 33/51] reads flag specific column --- src/read_sample_config.py | 6 ++++-- workflow/rules/common.smk | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/read_sample_config.py b/src/read_sample_config.py index 5b1bc3b..bcbdfb2 100644 --- a/src/read_sample_config.py +++ b/src/read_sample_config.py @@ -39,14 +39,16 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": vcf, "bam": bam} # expect only filepath per field - for colname in ["capture_bed"]: + for colname in ["capture_bed", "multiqc_rename_config"]: if colname in row: samples_dict[sample][colname] = is_valid_file(row[colname]) # expect >=1 filepath per field for colname in ["fastqc_raw", "fastqc_trimmed", "fastq_screen", "dedup"]: if colname in row: - samples_dict[sample][colname] = [is_valid_file(f) for f in row[colname].split(",")] + samples_dict[sample][colname] = [ + is_valid_file(f) for f in row[colname].split(",") + ] return samples_dict, colnames diff --git a/workflow/rules/common.smk 
b/workflow/rules/common.smk index ab54045..3a53064 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -24,7 +24,7 @@ def read_sample_config(config_f): samples_dict[sample] = {"vcf": vcf, "bam": bam} # expect only filepath per field - for colname in ["capture_bed"]: + for colname in ["capture_bed", "multiqc_rename_config"]: if colname in row: samples_dict[sample][colname] = row[colname] From 9546916bcf3ed7fa1e3b76dffb2b7fb70d7b6ec7 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:38:12 -0500 Subject: [PATCH 34/51] refactors input to use from sample config --- workflow/rules/aggregate_results.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index 71a32c0..730bd3f 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -33,7 +33,7 @@ rule multiqc_by_sample_initial_pass: OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry", OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats", multiqc_config=MULTIQC_CONFIG_FILE, - rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv" if ALLOW_SAMPLE_RENAMING else [], + rename_config=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["multiqc_rename_config"] if ALLOW_SAMPLE_RENAMING else [], output: protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc.html"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"), @@ -135,7 +135,7 @@ rule multiqc_by_sample_final_pass: OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats", OUT_DIR / "{sample}" / "qc" / "quac_watch" / "quac_watch_overall_summary.yaml", multiqc_config=MULTIQC_CONFIG_FILE, - rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / 
"multiqc_sample_rename_config" / "{sample}_rename_config.tsv" if ALLOW_SAMPLE_RENAMING else [], + rename_config=lambda wildcards: SAMPLES_CONFIG[wildcards.sample]["multiqc_rename_config"] if ALLOW_SAMPLE_RENAMING else [], output: protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc.html"), protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"), From 0cf0fb827672ec7679a177e8da369ddb767048b1 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:42:55 -0500 Subject: [PATCH 35/51] brings back rule aggregate_sample_rename_configs --- workflow/rules/aggregate_results.smk | 43 +++++++++++++--------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index 730bd3f..0409845 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -164,30 +164,27 @@ rule multiqc_by_sample_final_pass: ########################## Multi-sample QC aggregation ########################## -# rule aggregate_sample_rename_configs: -# input: -# expand( -# PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", -# sample=SAMPLES, -# ), -# output: -# outfile=protected(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv"), -# tempfile=temp(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "flist.txt"), -# message: -# "Aggregate all sample rename-config files." 
-# singularity: -# "docker://quay.io/biocontainers/mulled-v2-78a02249d8cc4e85718933e89cf41d0e6686ac25:70df245247aac9844ee84a9da1e96322a24c1f34-0" -# shell: -# r""" -# # save files in a tempfile -# echo {input} \ -# | tr " " "\n" \ -# > {output.tempfile} +rule aggregate_sample_rename_configs: + input: + [SAMPLES_CONFIG[sample]["multiqc_rename_config"] for sample in SAMPLES_CONFIG] + output: + outfile=protected(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv"), + tempfile=temp(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "flist.txt"), + message: + "Aggregate all sample rename-config files." + singularity: + "docker://quay.io/biocontainers/mulled-v2-78a02249d8cc4e85718933e89cf41d0e6686ac25:70df245247aac9844ee84a9da1e96322a24c1f34-0" + shell: + r""" + # save files in a tempfile + echo {input} \ + | tr " " "\n" \ + > {output.tempfile} -# python src/aggregate_sample_rename_configs.py \ -# --infile {output.tempfile} \ -# --outfile {output.outfile} -# """ + python src/aggregate_sample_rename_configs.py \ + --infile {output.tempfile} \ + --outfile {output.outfile} + """ rule multiqc_aggregation_all_samples: From 538af0e031995287442f81d9553fdb493663fbcb Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:44:38 -0500 Subject: [PATCH 36/51] removes localrules --- workflow/rules/aggregate_results.smk | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index 0409845..51812cb 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -1,7 +1,4 @@ ########################## Create Multiqc config file ########################## -localrules: - create_multiqc_config, - rule create_multiqc_config: input: script=WORKFLOW_PATH / "src" / "quac_watch" / "create_mutliqc_configs.py", From 084b0e3659b3fe2865b7f0ba433377a488cbadea Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Thu, 5 Oct 2023 22:52:28 
-0500 Subject: [PATCH 37/51] updates changelog --- docs/Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index 1141c6c..5f646f2 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -14,6 +14,10 @@ YYYY-MM-DD John Doe 2023-10-05 Manavalan Gajapathy +* Refactors to accept sample filepaths via user-provided sample config file, when `--allow_sample_renaming` is used (#86) + +2023-10-05 Manavalan Gajapathy + * Refactors to accept sample filepaths via user-provided sample config file, when `--include_prior_qc` is used (#86) * Adds a test sample config file that includes priorQC filepaths From f9471b068718f52d04abbe48b27c0a97ff5d64d5 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 13:23:48 -0500 Subject: [PATCH 38/51] fixes bug arising when run without --allow_sample_renaming --- workflow/rules/aggregate_results.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index 51812cb..b682dc3 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -163,7 +163,7 @@ rule multiqc_by_sample_final_pass: ########################## Multi-sample QC aggregation ########################## rule aggregate_sample_rename_configs: input: - [SAMPLES_CONFIG[sample]["multiqc_rename_config"] for sample in SAMPLES_CONFIG] + [SAMPLES_CONFIG[sample]["multiqc_rename_config"] for sample in SAMPLES_CONFIG] if ALLOW_SAMPLE_RENAMING else [], output: outfile=protected(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "aggregated_rename_configs.tsv"), tempfile=temp(OUT_DIR / "project_level_qc" / "multiqc" / "configs" / "flist.txt"), From 7dd5d73be3036d6e2e0448b719dee6bfd8c68225 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:13:19 -0500 Subject: [PATCH 39/51] adds column description --- docs/input_output.md | 16 ++++++++++++++++ 1 file changed, 16 
insertions(+) diff --git a/docs/input_output.md b/docs/input_output.md index 2ef89f1..1004535 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -2,6 +2,22 @@ ## Input +### Sample config file + +| Column | When to use | Description | +| --------------------- | ------------------------- | ----------------------------------------------------------------------------------------------------- | +| sample_id | Always | Sample identifier | +| bam | Always | BAM filepath | +| vcf | Always | VCF filepath | +| capture_bed | `--exome` | Capture region bed filepath | +| fastqc_raw | `--include_prior_qc` | Filepath to FastQC `zip` files created from raw fastqs. Use comma as delimiter if multiple files. | +| fastqc_trimmed | `--include_prior_qc` | Filepath to FastQC `zip` files created from trimmed fastqs. Use comma as delimiter if multiple files. | +| fastq_screen | `--include_prior_qc` | Filepath to FastQ Screen `txt` files. Use comma as delimiter if multiple files. | +| dedup | `--include_prior_qc` | Filepath to Picard's MarkDuplicates `txt` files. Use comma as delimiter if multiple files. | +| multiqc_rename_config | `--allow_sample_renaming` | Filepath to label rename configfile to use with multiqc | + +### Pedigree file + Samples belonging to a project are provided as input via `--pedigree` to QuaC in [pedigree file From e649368fce8ffc662b4a5b199e89326b045137cd Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:31:57 -0500 Subject: [PATCH 40/51] removes outdated section --- docs/input_output.md | 65 +------------------------------------------- 1 file changed, 1 insertion(+), 64 deletions(-) diff --git a/docs/input_output.md b/docs/input_output.md index 1004535..4187ecb 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -33,73 +33,10 @@ supplied in pedigree file will be processed by QuaC and all of these samples mus affected status info. See header of the script for usage instructions. 
-Each sample must have `BAM` and `VCF` files available in the directory structure shown below for sample `X`. - -``` -test_project/ -└── analysis - ├── X - │ ├── bam - │ │   ├── X.bam - │ │   └── X.bam.bai - │ └── vcf - │ ├── X.vcf.gz - │ └── X.vcf.gz.tbi - └── Y - └── .... -``` - -When run in exome mode using flag `--exome`, QuaC requires a capture-regions bed file at the path -`path_to_sample/configs/small_variant_caller/.bed` for each sample. - -``` -test_project/ -└── analysis - ├── X - │ ├── bam - │ │   ├── X.bam - │ │   └── X.bam.bai - │ ├── configs - │ │   └── small_variant_caller - │ │   └── capture_regions.bed - │ └── vcf - │ ├── X.vcf.gz - │ └── X.vcf.gz.tbi - └── Y - └── .... -``` - *Optionally*, QuaC can also utilize QC results produced by [certain tools](./index.md#optional-qc-output-consumed-by-quac) when run with flag `--include_prior_qc`. In this case, following directory structure is expected. -``` -test_project/ -└── analysis - ├── X - │ ├── bam - │ │   ├── X.bam - │ │   └── X.bam.bai - │ ├── qc - │ │   ├── dedup - │ │   │   ├── X-1.metrics.txt - │ │   │   └── X-2.metrics.txt - │ │   ├── fastqc-raw - │ │   │   ├── .... - │ │   ├── fastqc-trimmed - │ │   │   ├── .... - │ │   ├── fastq_screen-trimmed - │ │   │   └── .... - │ │   └── multiqc_initial_pass <--- needed only when `--allow_sample_renaming` flag is used - │ │   └── multiqc_sample_rename_config - │ │   └── X_rename_config.tsv - │ └── vcf - │ ├── X.vcf.gz - │ └── X.vcf.gz.tbi - └── Y - └── .... -``` - !!! note "CGDS users only" @@ -120,7 +57,7 @@ QuaC results are stored at the path specified via option `--outdir` (default: `data/quac/results/test_project/analysis`). Refer to the [system testing's output](./system_testing.md#expected-output-files) to learn more about the output directory structure. -!!! tip +!!! tip Users may primarily be interested in the aggregated QC results produced by [multiqc](https://multiqc.info/), both at sample-level as well as at the project-level. 
These multiqc reports also include summary of QuaC-Watch From d361c6403da5b9ecfb26d27a008c1d880aa88130 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:32:10 -0500 Subject: [PATCH 41/51] adds desc for sample configfile --- docs/input_output.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/input_output.md b/docs/input_output.md index 4187ecb..abe2d40 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -4,6 +4,10 @@ ### Sample config file +Sample identifier and their necessary filepaths (`bam`, `vcf`, etc.) are provided to QuaC in a `tsv` formatted config +file via `--sample_config`. Columns required depend on the flags supplied to `src/run_quac.py`. This table lists the +allowed columns and when to use them. + | Column | When to use | Description | | --------------------- | ------------------------- | ----------------------------------------------------------------------------------------------------- | | sample_id | Always | Sample identifier | @@ -16,6 +20,15 @@ | dedup | `--include_prior_qc` | Filepath to Picard's MarkDuplicates `txt` files. Use comma as delimiter if multiple files. | | multiqc_rename_config | `--allow_sample_renaming` | Filepath to label rename configfile to use with multiqc | +Refer to our system testing directory for example sample config files at `.test/configs`. For example: + +* `.test/configs/no_priorQC/sample_config/project_2samples_wgs.tsv` - Sample config file for WGS samples and no prior + QC. +* `.test/configs/no_priorQC/sample_config/project_2samples_exome.tsv` - Sample config file for exome samples and no + prior QC. Note that WGS and exome samples can't be used in the same config file. +* `.test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv` - Sample config file for WGS samples with prior + QC data available from [certain QC tools](./index.md#optional-qc-output-consumed-by-quac). 
+ ### Pedigree file From cb9605a437fa3b69f0f5864f82a83e0278642bd7 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:34:23 -0500 Subject: [PATCH 42/51] removes outdated text --- docs/input_output.md | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/docs/input_output.md b/docs/input_output.md index abe2d40..679a7b7 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -45,30 +45,11 @@ supplied in pedigree file will be processed by QuaC and all of these samples mus create a dummy pedigree file, which will lack sex (unless project tracking sheet is provided), relatedness and affected status info. See header of the script for usage instructions. - -*Optionally*, QuaC can also utilize QC results produced by [certain -tools](./index.md#optional-qc-output-consumed-by-quac) when run with flag `--include_prior_qc`. In this case, following -directory structure is expected. - - -!!! note "CGDS users only" - - Output (bam, vcf and QC output) produced by CGDS's small variant caller pipeline can be readily used as input to - QuaC with flags `--include_prior_qc` and `--allow_sample_renaming`. - -### Example project structure - -Refer to system testing directory `.test/` in the repo for an example project to see an example project with above -mentioned directory structure needed as input. In this setup, projects A and B have prior QC data included, whereas -samples C and D do not have them. Refer to pedigree files under `.test/configs/` on how these example samples were used -as input to QuaC. - - ## Output QuaC results are stored at the path specified via option `--outdir` (default: `data/quac/results/test_project/analysis`). Refer to the [system testing's -output](./system_testing.md#expected-output-files) to learn more about the output directory structure. +output](./system_testing.md#expected-output-files) to learn more about the output directory structure. !!! 
tip @@ -80,4 +61,3 @@ output](./system_testing.md#expected-output-files) to learn more about the outpu QuaC's output directory structure was designed based on the output structure of the [CGDS small variant caller pipeline](https://gitlab.rc.uab.edu/center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline). - From b513dd3bb22a00fa1bbb60a1d5d416e7ec7f18ab Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:39:40 -0500 Subject: [PATCH 43/51] updates pedigree reqt --- docs/input_output.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/input_output.md b/docs/input_output.md index 679a7b7..19fe254 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -33,9 +33,9 @@ Refer to our system testing directory for example sample config files at `.test/ -Samples belonging to a project are provided as input via `--pedigree` to QuaC in [pedigree file -format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Only the samples that are -supplied in pedigree file will be processed by QuaC and all of these samples must belong to the same project. +QuaC requires a [pedigree +file](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format) as input via `--pedigree`. +Samples listed in this file must correspond to those in sample config file (`--sample_config`). 
From e4755eb94551271b9d16fa78aaf2a8ef4e7c36d0 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:46:49 -0500 Subject: [PATCH 44/51] expands output info --- docs/input_output.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/input_output.md b/docs/input_output.md index 19fe254..daf170f 100644 --- a/docs/input_output.md +++ b/docs/input_output.md @@ -48,12 +48,16 @@ Samples listed in this file must correspond to those in sample config file (`--s ## Output QuaC results are stored at the path specified via option `--outdir` (default: -`data/quac/results/test_project/analysis`). Refer to the [system testing's +`data/quac/results/test_project/analysis`). Refer to the [system testing's output](./system_testing.md#expected-output-files) to learn more about the output directory structure. +QC outputs are stored at the sample level as well as the project level (i.e., all samples considered together) depending on +the type of QC run. For example, the Qualimap tool is run at the sample level whereas the Somalier tool is run at the project +level. MultiQC reports are available both at the sample and project level. + !!! tip - Users may primarily be interested in the aggregated QC results produced by [multiqc](https://multiqc.info/), + Users may primarily be interested in the aggregated QC results produced by [MultiQC](https://multiqc.info/), both at sample-level as well as at the project-level. These multiqc reports also include summary of QuaC-Watch results at the top. 
From 25cc54091e57fc3c1251ca53c80eda00f9e9f274 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:51:50 -0500 Subject: [PATCH 45/51] uses variable --- docs/system_testing.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index 1f7c1a1..e2550be 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -42,7 +42,7 @@ PRIOR_QC_STATUS="no_priorQC" # WGS mode python src/run_quac.py \ - --sample_config ".test/configs/no_priorQC/sample_config/project_2samples_wgs.tsv" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_wgs.tsv" \ --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ @@ -51,7 +51,7 @@ python src/run_quac.py \ # Exome mode python src/run_quac.py \ - --sample_config ".test/configs/no_priorQC/sample_config/project_2samples_exome.tsv" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_exome.tsv" \ --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ @@ -66,7 +66,7 @@ PRIOR_QC_STATUS="include_priorQC" # WGS mode python src/run_quac.py \ - --sample_config ".test/configs/include_priorQC/sample_config/project_2samples_wgs.tsv" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_wgs.tsv" \ --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ @@ -77,7 +77,7 @@ python src/run_quac.py \ # Exome mode python src/run_quac.py 
\ - --sample_config ".test/configs/include_priorQC/sample_config/project_2samples_exome.tsv" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_exome.tsv" \ --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_exome-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ From 69f20b2145de61c1d953c39ac76017168e70038e Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:52:55 -0500 Subject: [PATCH 46/51] updates install test cmd --- docs/installation_configuration.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/installation_configuration.md b/docs/installation_configuration.md index 5665563..fc40bde 100644 --- a/docs/installation_configuration.md +++ b/docs/installation_configuration.md @@ -207,9 +207,8 @@ PROJECT_CONFIG="project_2samples" PRIOR_QC_STATUS="no_priorQC" python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_wgs.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --outdir "data/quac/results/test_${PROJECT_CONFIG}_wgs-${PRIOR_QC_STATUS}/analysis" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --workflow_config "configs/workflow.yaml" \ From 05a349f8bd7a7902e1e3613418743284def4abb1 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 06:55:29 -0500 Subject: [PATCH 47/51] updates commands --- docs/visualize_pipeline.md | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/visualize_pipeline.md b/docs/visualize_pipeline.md index d6b1395..dc9d657 100644 --- a/docs/visualize_pipeline.md +++ b/docs/visualize_pipeline.md @@ -24,9 +24,8 @@ 
DAG_DIR="pipeline_visualized" ###### WGS mode ###### # DAG python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_wgs.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --include_prior_qc \ --extra_args "--dag -F | dot -Tpng > ${DAG_DIR}/wgs_dag.png" @@ -34,9 +33,8 @@ python src/run_quac.py \ # Rulegraph - less informative than DAG at sample level but less dense than DAG makes this easier to skim python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_wgs.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --quac_watch_config "configs/quac_watch/wgs_quac_watch_config.yaml" \ --include_prior_qc \ --extra_args "--rulegraph -F | dot -Tpng > ${DAG_DIR}/wgs_rulegraph.png" @@ -45,9 +43,8 @@ python src/run_quac.py \ ###### Exome mode ###### # DAG python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree ".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_exome.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --include_prior_qc \ --exome \ @@ -56,12 +53,10 @@ python src/run_quac.py \ # Rulegraph - less informative than DAG at sample level but less dense than DAG makes this easier to skim python src/run_quac.py \ - --project_name test_project \ - --projects_path ".test/ngs-data/" \ - --pedigree 
".test/configs/${PRIOR_QC_STATUS}/${PROJECT_CONFIG}.ped" \ + --sample_config ".test/configs/${PRIOR_QC_STATUS}/sample_config/${PROJECT_CONFIG}_exome.tsv" \ + --pedigree ".test/configs/${PRIOR_QC_STATUS}/pedigree/${PROJECT_CONFIG}.ped" \ --quac_watch_config "configs/quac_watch/exome_quac_watch_config.yaml" \ --include_prior_qc \ --exome \ --extra_args "--rulegraph -F | dot -Tpng > ${DAG_DIR}/exome_rulegraph.png" - ``` From 5f2aeeff7aba1ab1082653c3b4b9011ebad7c4e1 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 12:38:53 -0500 Subject: [PATCH 48/51] adds doc on editing quac-watch configs --- docs/quac_watch.md | 49 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/docs/quac_watch.md b/docs/quac_watch.md index 4084727..c667197 100644 --- a/docs/quac_watch.md +++ b/docs/quac_watch.md @@ -3,8 +3,7 @@ QuaC includes a tool called QuaC-Watch, which consumes results from several QC tools, compares QC metrics against the acceptable thresholds, and summarizes results using color-coded `pass`/`fail` flags for efficient review. This summary allows users to quickly review output from multiple QC tools, identify whether samples meet expected quality thresholds, -and readily highlight samples that need further review. - +and readily highlight samples that need further review. ## Configs @@ -16,15 +15,53 @@ We provide pre-defined thresholds for QC metrics as part of the QuaC repo and th These thresholds were curated based on -* literature +* literature * in-house analyses using hundreds of GS and ES samples -* knowledge gained from our past sample QC experiences - +* knowledge gained from our past sample QC experiences !!! info QuaC is built to use with Human WGS/WES data. If you would like to use it with non-human data, please modify the pipeline as needed -- especially the thresholds used in QuaC-Watch configs. 
+### Editing configs + +If you would like to modify the thresholds, you may use the above-mentioned QuaC-Watch config files as templates and then +choose the thresholds of your choice. Refer to the tool and QC metric of interest to identify reasonable values to use. + +Each QC metric in the config file, except FastQC, requires at least one of the `min` and `max` values to be specified. For +example, the snippet below shows thresholds for Qualimap's `mean_coverage` with minimum value of 30 and maximum value +not specified. + +```yaml +qualimap: + mean_coverage: + description: "Mean coverage" + min: 30 + max: +``` + +If you would like to specify maximum value, say 50, modify the snippet as follows: + +```yaml +qualimap: + mean_coverage: + description: "Mean coverage" + min: 30 + max: 50 +``` + +While you may modify the threshold values and description, modifying or removing field names or keys would result in +QuaC-Watch behaving in an unexpected manner. For example, the below snippet would fail as it is missing the `max` field. + +```yaml +qualimap: + mean_coverage: + description: "Mean coverage" + min: 30 +``` + + + ## QuaC-Watch in MultiQC report QuaC aggregates QC results from multiple tools using MultiQC into a single stand-alone interactive HTML report. @@ -34,4 +71,4 @@ check for sample quality. Here is an example screenshot: ![QuaC-Watch report](./images/quac_watch_multiqc.png "QuaC-Watch report at the top of MultiQC report") Users may optionally toggle columns to view values for QC metrics of interest and hover over the column title to view -thresholds used by Quac-Watch (highlighted by red arrow). \ No newline at end of file +thresholds used by Quac-Watch (highlighted by red arrow). 
From fcb2420caa9d83cde6b7675bd4012c18682bba6f Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 12:51:35 -0500 Subject: [PATCH 49/51] updates changelog --- docs/Changelog.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index 5f646f2..dda9d9a 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -12,6 +12,12 @@ YYYY-MM-DD John Doe ``` --- +2023-10-06 Manavalan Gajapathy + +* Adds documentation on providing sample filepaths via user-provided sample config file due to recent PRs #87, #88, #89 + and #90 (closes #86). +* Adds documentation on editing thresholds in the QuaC-Watch config file (closes #85) + 2023-10-05 Manavalan Gajapathy * Refactors to accept sample filepaths via user-provided sample config file, when `--allow_sample_renaming` is used (#86) From cac1a977546ae0590d439dc54982e6fa41f27454 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Fri, 6 Oct 2023 20:44:00 -0500 Subject: [PATCH 50/51] removes outdated note --- docs/system_testing.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/system_testing.md b/docs/system_testing.md index e2550be..b1f9394 100644 --- a/docs/system_testing.md +++ b/docs/system_testing.md @@ -88,10 +88,6 @@ python src/run_quac.py \ $USE_SLURM ``` -!!! note - - Use `PROJECT="project_1sample"` to test out a project with only one sample. - ## Expected output files Output directory structure for WGS + `include_prior_qc` mode would look like this. 
From 3a38e46f55f8753b3cc9c8d40075851dad979088 Mon Sep 17 00:00:00 2001 From: Manavalan Gajapathy Date: Mon, 9 Oct 2023 09:49:39 -0500 Subject: [PATCH 51/51] updates changelog --- docs/Changelog.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Changelog.md b/docs/Changelog.md index dda9d9a..1f8cde6 100644 --- a/docs/Changelog.md +++ b/docs/Changelog.md @@ -12,6 +12,10 @@ YYYY-MM-DD John Doe ``` --- +2023-10-09 Manavalan Gajapathy + +* Merges `joss_manuscript` to the `master` branch to bring it up to date. + 2023-10-06 Manavalan Gajapathy * Adds documentation on providing sample filepaths via user-provided sample config file due to recent PRs #87, #88, #89