diff --git a/Changelog.md b/Changelog.md
index 3d0fccc..2b16286 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -35,3 +35,8 @@ YYYY-MM-DD John Doe
 
 * Bugfix: Fixes error when there is only one sample in input ped file (#34)
 * Adds system-testing for such only-one-sample-in-input setup (#35).
+
+2022-04-07 Manavalan Gajapathy
+
+* Hardware resources for snakemake rules, previously hardcoded, can now be supplied via `configs/workflow.yaml` (closes #48)
+* Modified multiqc conda env config to use explicit dependencies to work around installation issues (closes #47)
\ No newline at end of file
diff --git a/README.md b/README.md
index 67047b5..81a5c59 100644
--- a/README.md
+++ b/README.md
@@ -185,18 +185,24 @@ snakemake rules.
 
 ### Set up workflow config file
 
-QuaC requires a workflow config file in yaml format (`configs/workflow.yaml`), which provides filepaths to necessary
-dependencies required by certain QC tools. Their format should look like:
+QuaC requires a workflow config file in yaml format ([`configs/workflow.yaml`](./configs/workflow.yaml)), which provides filepaths to
+dataset dependencies required by certain QC tools. In addition, hardware resources can be configured (see [`configs/workflow.yaml`](./configs/workflow.yaml) for details). The file format should look like:
 
 ```yaml
-ref: "path to ref genome path"
-somalier:
-  sites: "path to somalier's site file"
-  labels_1kg: "path to somalier's ancestry-labels-1kg file"
-  somalier_1kg: "dirpath to somalier's 1kg-somalier files"
-verifyBamID:
-  svd_dat_wgs: "path to WGS resources .dat files"
-  svd_dat_exome: "path to exome resources .dat files"
+datasets:
+  ref: "path to ref genome"
+  somalier:
+    sites: "path to somalier's site file"
+    labels_1kg: "path to somalier's ancestry-labels-1kg file"
+    somalier_1kg: "dirpath to somalier's 1kg-somalier files"
+  verifyBamID:
+    svd_dat_wgs: "path to WGS resources .dat files"
+    svd_dat_exome: "path to exome resources .dat files"
+
+#### hardware resources ####
+resources:
+  ...
+  ...
 ```
 
 #### Prepare verifybamid datasets for exome analysis
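The README change above documents the new two-part layout: `datasets:` for file dependencies and `resources:` for per-rule hardware settings. Below is a minimal sketch of how such a file can be loaded and queried, assuming the concrete key names shown later in this diff for `configs/workflow.yaml` and using PyYAML as `src/run_quac.py` already does; the `load_workflow_config` helper name is illustrative only and is not part of this change.

```python
from pathlib import Path

import yaml  # PyYAML, already a dependency of src/run_quac.py


def load_workflow_config(config_fpath):
    """Illustrative sketch: read the restructured workflow config."""
    with open(config_fpath) as fh:
        config = yaml.safe_load(fh)

    # dataset filepaths now sit under the top-level "datasets" key
    ref_genome = Path(config["datasets"]["ref"])
    somalier_sites = Path(config["datasets"]["somalier"]["sites"])

    # per-rule hardware settings sit under the top-level "resources" key
    qualimap = config["resources"]["qualimap_bamqc"]
    print(ref_genome, somalier_sites, qualimap["no_cpu"], qualimap["mem_per_cpu"])


load_workflow_config("configs/workflow.yaml")
```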
diff --git a/configs/cluster_config.json b/configs/cluster_config.json
index 0232a67..8127f6b 100644
--- a/configs/cluster_config.json
+++ b/configs/cluster_config.json
@@ -20,4 +20,4 @@
     "multiqc_aggregation_all_samples": {
         "mem-per-cpu": "24G"
     }
-}
+}
\ No newline at end of file
diff --git a/configs/env/multiqc.yaml b/configs/env/multiqc.yaml
index 321e793..92cfadf 100644
--- a/configs/env/multiqc.yaml
+++ b/configs/env/multiqc.yaml
@@ -1,7 +1,84 @@
 channels:
-  - conda-forge
-  - anaconda
   - bioconda
+  - conda-forge
+  - defaults
 dependencies:
-  - python =3.6
-  - multiqc=1.9
+  - python=3.6.13
+  - multiqc==1.9
+  - networkx=2.5
+  - numpy=1.19.5
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - brotlipy=0.7.0
+  - ca-certificates=2021.5.30
+  - certifi=2021.5.30
+  - cffi=1.14.6
+  - chardet=4.0.0
+  - charset-normalizer=2.0.0
+  - click=8.0.1
+  - coloredlogs=15.0.1
+  - colormath=3.0.0
+  - cryptography=3.4.7
+  - cycler=0.10.0
+  - decorator=5.0.9
+  - freetype=2.10.4
+  - future=0.18.2
+  - humanfriendly=9.2
+  - idna=3.1
+  - importlib-metadata=4.6.3
+  - jbig=2.1
+  - jinja2=3.0.1
+  - jpeg=9d
+  - kiwisolver=1.3.1
+  - lcms2=2.12
+  - ld_impl_linux-64=2.36.1
+  - lerc=2.2.1
+  - libblas=3.9.0
+  - libcblas=3.9.0
+  - libdeflate=1.7
+  - libffi=3.3
+  - libgcc-ng=11.1.0
+  - libgfortran-ng=11.1.0
+  - libgfortran5=11.1.0
+  - libgomp=11.1.0
+  - liblapack=3.9.0
+  - libopenblas=0.3.17
+  - libpng=1.6.37
+  - libstdcxx-ng=11.1.0
+  - libtiff=4.3.0
+  - libwebp-base=1.2.0
+  - lz4-c=1.9.3
+  - lzstring=1.0.4
+  - markdown=3.3.4
+  - markupsafe=2.0.1
+  - matplotlib-base=3.3.4
+  - ncurses=6.2
+  - olefile=0.46
+  - openjpeg=2.4.0
+  - openssl=1.1.1k
+  - pillow=8.3.1
+  - pip=21.2.3
+  - pycparser=2.20
+  - pyopenssl=20.0.1
+  - pyparsing=2.4.7
+  - pysocks=1.7.1
+  - python-dateutil=2.8.2
+  - python_abi=3.6
+  - pyyaml=5.4.1
+  - readline=8.1
+  - requests=2.26.0
+  - setuptools=49.6.0
+  - simplejson=3.8.1
+  - six=1.16.0
+  - spectra=0.0.11
+  - sqlite=3.36.0
+  - tk=8.6.10
+  - tornado=6.1
+  - typing_extensions=3.10.0.0
+  - urllib3=1.26.6
+  - wheel=0.37.0
+  - xz=5.2.5
+  - yaml=0.2.5
+  - zipp=3.5.0
+  - zlib=1.2.11
+  - zstd=1.5.0
diff --git a/configs/workflow.yaml b/configs/workflow.yaml
index ff9f4f5..7fb7e5e 100644
--- a/configs/workflow.yaml
+++ b/configs/workflow.yaml
@@ -1,8 +1,19 @@
-ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
-somalier:
-  sites: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/sites/sites.hg38.vcf.gz"
-  labels_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/ancestry-labels-1kg.tsv"
-  somalier_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/"
-verifyBamID:
-  svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat"
-  svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat"
+datasets:
+  ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+  somalier:
+    sites: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/sites/sites.hg38.vcf.gz"
+    labels_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/ancestry-labels-1kg.tsv"
+    somalier_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/"
+  verifyBamID:
+    svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat"
+    svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat"
+
+#### hardware resources ####
+resources:
+  qualimap_bamqc:
+    no_cpu: 2
+    mem_per_cpu: "24G"
+  mosdepth_coverage:
+    no_cpu: 4
+  verifybamid:
+    no_cpu: 4
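The `resources:` block above is read by three rules further down in this diff (`qualimap_bamqc`, `mosdepth_coverage`, `verifybamid`). A misspelled rule name or missing key would only surface when Snakemake evaluates the rule, so a small up-front check along the following lines could help; this validator is a hypothetical illustration and is not part of this change.

```python
import yaml  # PyYAML

# rule name -> resource keys that the snakemake rules in this diff read
EXPECTED_RESOURCES = {
    "qualimap_bamqc": {"no_cpu", "mem_per_cpu"},
    "mosdepth_coverage": {"no_cpu"},
    "verifybamid": {"no_cpu"},
}


def check_resources(workflow_config_fpath):
    """Hypothetical check: every rule's expected resource keys must be present."""
    with open(workflow_config_fpath) as fh:
        resources = yaml.safe_load(fh).get("resources", {})

    missing = {
        rule: sorted(keys - set(resources.get(rule, {})))
        for rule, keys in EXPECTED_RESOURCES.items()
        if keys - set(resources.get(rule, {}))
    }
    if missing:
        raise ValueError(f"workflow config is missing resource settings: {missing}")


check_resources("configs/workflow.yaml")
```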
"/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/" + verifyBamID: + svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat" + svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat" + +#### hardware resources #### +resources: + qualimap_bamqc: + no_cpu: 2 + mem_per_cpu: "24G" + mosdepth_coverage: + no_cpu: 4 + verifybamid: + no_cpu: 4 diff --git a/src/run_quac.py b/src/run_quac.py index 9ed6646..f76c4ef 100755 --- a/src/run_quac.py +++ b/src/run_quac.py @@ -43,17 +43,17 @@ def read_workflow_config(workflow_config_fpath): data = yaml.safe_load(fh) mount_paths = set() - + datasets = data["datasets"] # ref genome - mount_paths.add(Path(data["ref"]).parent) + mount_paths.add(Path(datasets["ref"]).parent) # somalier resource files - for resource in data["somalier"]: - mount_paths.add(Path(data["somalier"][resource]).parent) + for resource in datasets["somalier"]: + mount_paths.add(Path(datasets["somalier"][resource]).parent) # verifyBamID resource files - for resource in data["verifyBamID"]: - mount_paths.add(Path(data["verifyBamID"][resource]).parent) + for resource in datasets["verifyBamID"]: + mount_paths.add(Path(datasets["verifyBamID"][resource]).parent) return mount_paths diff --git a/workflow/rules/aggregate_results.smk b/workflow/rules/aggregate_results.smk index c6971bc..5dac20e 100644 --- a/workflow/rules/aggregate_results.smk +++ b/workflow/rules/aggregate_results.smk @@ -50,6 +50,9 @@ rule multiqc_by_sample_initial_pass: # multiqc uses fastq's filenames to identify sample names. Rename them to in-house names, # using custom rename config file extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}", + conda: + ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### + str(WORKFLOW_PATH / "configs/env/multiqc.yaml") wrapper: "0.64.0/bio/multiqc" @@ -133,10 +136,14 @@ rule multiqc_by_sample_final_pass: # multiqc uses fastq's filenames to identify sample names. Rename them to in-house names, # using custom rename config file extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}", + conda: + ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### + str(WORKFLOW_PATH / "configs/env/multiqc.yaml") wrapper: "0.64.0/bio/multiqc" + ########################## Multi-sample QC aggregation ########################## localrules: aggregate_sample_rename_configs, @@ -192,5 +199,8 @@ rule multiqc_aggregation_all_samples: --sample-names {input.rename_config} \ --cl_config "max_table_rows: 2000"' ), + conda: + ### see issue #47 on why local conda env is used to sidestep snakemake-wrapper's ### + str(WORKFLOW_PATH / "configs/env/multiqc.yaml") wrapper: "0.64.0/bio/multiqc" diff --git a/workflow/rules/coverage_analysis.smk b/workflow/rules/coverage_analysis.smk index e8ff619..1b00f77 100644 --- a/workflow/rules/coverage_analysis.smk +++ b/workflow/rules/coverage_analysis.smk @@ -24,11 +24,11 @@ rule qualimap_bamqc: "stats bam using qualimap. 
diff --git a/workflow/rules/relatedness_ancestry.smk b/workflow/rules/relatedness_ancestry.smk
index 5bacedf..a1db793 100644
--- a/workflow/rules/relatedness_ancestry.smk
+++ b/workflow/rules/relatedness_ancestry.smk
@@ -2,8 +2,8 @@ rule somalier_extract:
     input:
         bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
         bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
-        sites=config["somalier"]["sites"],
-        ref_genome=config["ref"],
+        sites=config["datasets"]["somalier"]["sites"],
+        ref_genome=config["datasets"]["ref"],
     output:
         protected(OUT_DIR / "project_level_qc" / "somalier" / "extract" / "{sample}.somalier"),
     message:
@@ -55,8 +55,8 @@ rule somalier_relate:
 rule somalier_ancestry:
     input:
         extracted=expand(OUT_DIR / "project_level_qc" / "somalier" / "extract" / "{sample}.somalier", sample=SAMPLES),
-        labels_1kg=config["somalier"]["labels_1kg"],
-        somalier_1kg_directory=config["somalier"]["somalier_1kg"],
+        labels_1kg=config["datasets"]["somalier"]["labels_1kg"],
+        somalier_1kg_directory=config["datasets"]["somalier"]["somalier_1kg"],
     output:
         out=protected(
             expand(
diff --git a/workflow/rules/within_species_contamintation.smk b/workflow/rules/within_species_contamintation.smk
index 3c208b6..80b425e 100644
--- a/workflow/rules/within_species_contamintation.smk
+++ b/workflow/rules/within_species_contamintation.smk
@@ -1,15 +1,15 @@
 def get_svd(wildcards):
     if EXOME_MODE:
-        return expand(f"{config['verifyBamID']['svd_dat_exome']}.{{ext}}", ext=["bed", "mu", "UD"])
+        return expand(f"{config['datasets']['verifyBamID']['svd_dat_exome']}.{{ext}}", ext=["bed", "mu", "UD"])
     else:
-        return expand(f"{config['verifyBamID']['svd_dat_wgs']}.{{ext}}", ext=["bed", "mu", "UD"])
+        return expand(f"{config['datasets']['verifyBamID']['svd_dat_wgs']}.{{ext}}", ext=["bed", "mu", "UD"])
 
 
 rule verifybamid:
     input:
         bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
         bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
-        ref_genome=config["ref"],
+        ref_genome=config["datasets"]["ref"],
         svd=get_svd,
     output:
         ancestry=protected(OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry"),
@@ -22,7 +22,7 @@ rule verifybamid:
         svd_prefix=lambda wildcards, input: input["svd"][0].replace(Path(input["svd"][0]).suffix, ""),
         out_prefix=lambda wildcards, output: output["ancestry"].replace(".Ancestry", ""),
         sanity_check="--DisableSanityCheck" if is_testing_mode() else "",
-    threads: 4
+    threads: config["resources"]["verifybamid"]["no_cpu"]
     shell:
         r"""
         verifybamid2 {params.sanity_check} \
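For reference, `get_svd` above simply appends the three companion extensions to whichever `.dat` entry matches the analysis mode. A plain-Python sketch of the same lookup, with `exome_mode` standing in for the workflow's `EXOME_MODE` global:

```python
def svd_files(config, exome_mode):
    """Sketch of get_svd: list the verifyBamID SVD companion files for the chosen mode."""
    key = "svd_dat_exome" if exome_mode else "svd_dat_wgs"
    prefix = config["datasets"]["verifyBamID"][key]
    return [f"{prefix}.{ext}" for ext in ("bed", "mu", "UD")]


# With the WGS entry from configs/workflow.yaml this yields the
# 1000g.phase3.100k.b38.vcf.gz.dat.bed / .mu / .UD files passed to the verifybamid rule.
```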