
feat: Release v16.0.0 #1491

Merged: 43 commits, Nov 19, 2024

Commits
6a64ddc
remove gatk3
khurrammaqbool May 7, 2024
d800054
fix: Update scheduler script (#1372)
ivadym May 13, 2024
718aad0
changelog
khurrammaqbool May 30, 2024
5eaa7c6
push container
khurrammaqbool May 30, 2024
b8f8af6
stop push container
khurrammaqbool May 30, 2024
19eb398
changelog
khurrammaqbool May 31, 2024
828cf3a
update readthedocs
khurrammaqbool Jun 3, 2024
993b7ca
Merge pull request #1432 from Clinical-Genomics/feat/remove_gatk3
khurrammaqbool Jun 3, 2024
bce33da
add msisensorpro container
khurrammaqbool Jun 5, 2024
8929af5
split commands
khurrammaqbool Jun 5, 2024
45e704e
add ca-certificates
khurrammaqbool Jun 5, 2024
9893456
add path to INSTALL
khurrammaqbool Jun 5, 2024
59c6a59
remove version
khurrammaqbool Jun 5, 2024
98a7520
add container test
khurrammaqbool Jun 5, 2024
28547d5
fix typo
khurrammaqbool Jun 5, 2024
c0f33a5
refactor
khurrammaqbool Jun 12, 2024
7d529e6
fix command
khurrammaqbool Jun 13, 2024
38770a9
Merge pull request #1444 from Clinical-Genomics/feat/add_msisensor_co…
khurrammaqbool Jun 14, 2024
45ff291
chore: Update vcf2cytosure container (#1456)
ivadym Jun 25, 2024
f173c27
feat: update multiqc to 1.22.3 (#1441)
mathiasbio Jun 26, 2024
15762ab
Merge master into develop
ivadym Jun 26, 2024
6f48313
feat: add msisensorpro TN (#1454)
khurrammaqbool Jun 26, 2024
8ab0e4d
fix: update MSI table (#1459)
khurrammaqbool Jun 28, 2024
bb03e67
fix: CNVkit container (#1457)
ivadym Jul 2, 2024
aa6fef7
feat: add Sentieon path argument to config (#1461)
mathiasbio Jul 3, 2024
117ff7e
feat: add msi tn to storage (#1483)
khurrammaqbool Oct 11, 2024
4937d2d
chore: add QC criteria for lymphoma_MRD (#1479)
mathiasbio Oct 16, 2024
f654750
feat: deduplicate with UMIs (#1358)
mathiasbio Oct 16, 2024
ed830b1
fix: msi container (#1486)
khurrammaqbool Oct 16, 2024
5e7bb88
fix: somalier container (#1487)
khurrammaqbool Oct 16, 2024
fedcb8f
Merge branch 'develop' of github.com:Clinical-Genomics/BALSAMIC into …
mathiasbio Oct 17, 2024
0743161
fix: broken cache doc links (#1488)
mathiasbio Oct 17, 2024
2007528
Merge branch 'develop' of github.com:Clinical-Genomics/BALSAMIC into …
mathiasbio Oct 17, 2024
d3dc977
fix: umi coverage qc (#1490)
mathiasbio Oct 17, 2024
ce94f8d
Merge branch 'develop' of github.com:Clinical-Genomics/BALSAMIC into …
mathiasbio Oct 17, 2024
061c4bb
chore: update doc tool versions (#1489)
mathiasbio Oct 17, 2024
6e63310
Merge branch 'develop' of github.com:Clinical-Genomics/BALSAMIC into …
mathiasbio Oct 17, 2024
6b2e222
v16.0.0 changelog
mathiasbio Oct 17, 2024
d4b1de3
add multiqc to release
mathiasbio Oct 18, 2024
af52a45
fix manta path in container
mathiasbio Oct 19, 2024
03ddaf0
fix: vardict memory error (#1492)
khurrammaqbool Oct 21, 2024
7fc67a6
fix: Increase vardict tumor only cores allocation to 18 (#1495)
khurrammaqbool Oct 23, 2024
5699764
fix: tnscope found in (#1497)
mathiasbio Oct 26, 2024
2 changes: 1 addition & 1 deletion .github/workflows/black_linter.yml
@@ -11,4 +11,4 @@ jobs:
- uses: psf/black@stable
with:
options: "--check --verbose"
version: "22.3.0"
version: "23.7.0"
2 changes: 1 addition & 1 deletion .github/workflows/docker_build_publish_develop.yml
@@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: true
matrix:
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, multiqc, msisensorpro, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
steps:
- name: Git checkout
id: git_checkout
2 changes: 1 addition & 1 deletion .github/workflows/docker_build_publish_release.yml
@@ -11,7 +11,7 @@ jobs:
strategy:
fail-fast: true
matrix:
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
container-name: [align_qc, annotate, ascatNgs, cadd, cnvkit, cnvpytor, coverage_qc, delly, gatk, htslib, msisensorpro, multiqc, purecn, somalier, varcall_py3, varcall_py27, vcf2cytosure]
steps:
- name: Git checkout
id: git_checkout
4 changes: 2 additions & 2 deletions .github/workflows/urls_check.yml
@@ -13,7 +13,7 @@ jobs:
id: git_checkout
uses: actions/checkout@v3
- name: Link Checker
uses: lycheeverse/lychee-action@v1.8.0
uses: lycheeverse/lychee-action@v2.0.2
with:
args: --verbose './BALSAMIC/constants/cache.py' './docs/*.rst'
args: --max-redirects 10 --verbose './BALSAMIC/constants/cache.py' './docs/*.rst'
fail: true
37 changes: 37 additions & 0 deletions BALSAMIC/assets/scripts/cap_base_quality_in_bam.py
@@ -0,0 +1,37 @@
import click
import pysam
import numpy as np


@click.command()
@click.argument("input_bam", type=click.Path(exists=True))
@click.argument("output_bam", type=click.Path())
@click.option(
"--max-quality",
default=70,
type=int,
help="Maximum quality value to cap to.",
)
def cap_base_qualities(input_bam: str, output_bam: str, max_quality: int):
"""
Cap the base qualities in a BAM file.

Args:
input_bam (str): Input BAM file path.
output_bam (str): Output BAM file path.
max_quality (int): Maximum quality value to cap to.
"""
# Open input BAM file for reading
samfile = pysam.AlignmentFile(input_bam, "rb")
out_bam = pysam.AlignmentFile(output_bam, "wb", header=samfile.header)
for read in samfile.fetch():
qualities = np.array(read.query_qualities)
capped_qualities = np.minimum(qualities, max_quality)
# Update the base qualities in the read
read.query_qualities = capped_qualities.tolist()
# Write the modified read to the output BAM file
out_bam.write(read)
samfile.close()
out_bam.close()


if __name__ == "__main__":
cap_base_qualities()
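The capping step above is an elementwise clamp via `np.minimum`. A minimal standalone sketch of that logic, without the pysam I/O (which needs a real, indexed BAM), might look like:

```python
import numpy as np


def cap_qualities(qualities, max_quality=70):
    """Clamp each base quality to max_quality, mirroring the script's np.minimum step."""
    return np.minimum(np.array(qualities), max_quality).tolist()


# Qualities above the cap are reduced; the rest pass through unchanged.
print(cap_qualities([30, 70, 93, 12]))  # → [30, 70, 70, 12]
```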
39 changes: 29 additions & 10 deletions BALSAMIC/assets/scripts/collect_qc_metrics.py
@@ -144,7 +144,14 @@ def get_qc_supported_capture_kit(capture_kit, metrics: List[str]) -> str:
if k != "default":
available_panel_beds.append(k)

return next((i for i in available_panel_beds if i in capture_kit), None)
return next(
(
i
for i in available_panel_beds
if re.search(rf"{re.escape(i)}(?=_\d)", capture_kit)
),
None,
)


def get_requested_metrics(config: dict, metrics: dict) -> dict:
@@ -186,41 +193,53 @@ def get_metric_condition(
return req_metrics


def get_sample_id(multiqc_key: str) -> str:
"""Return the sample ID extracted from a MultiQC data JSON key.

Examples of possible sample formats from "report_saved_raw_data":
tumor.ACCXXXXXX
tumor.ACCXXXXXX_FR
ACCXXXXXX_align_sort_HMYLNDSXX_ACCXXXXXX_S165_L001

Returns:
str: The extracted sample ID in the ACCXXXXXX format.
"""
if "_align_sort_" in multiqc_key:
return multiqc_key.split("_")[0]
return multiqc_key.split(".")[1].split("_")[0]


def get_multiqc_metrics(config: dict, multiqc_data: dict) -> list:
"""Extracts and returns the requested metrics from a multiqc JSON file"""

requested_metrics = get_requested_metrics(config, METRICS)

def extract(data, output_metrics, sample=None, source=None):
def extract(data, output_metrics, multiqc_key=None, source=None):
"""Recursively fetch metrics data from a nested multiqc JSON"""

if isinstance(data, dict):
for k in data:
# Ignore UMI and reverse reads metrics
if "umi" not in k:
if k in requested_metrics:
# example of possible sample-formats below from "report_saved_raw_data":
# tumor.ACCXXXXXX
# tumor.ACCXXXXXX_FR
# extracted below for id to: ACCXXXXXX
output_metrics.append(
Metric(
id=sample.split(".")[1].split("_")[0],
id=get_sample_id(multiqc_key),
input=get_multiqc_data_source(
multiqc_data, sample, source
multiqc_data, multiqc_key, source
),
name=k,
step=source,
value=data[k],
condition=get_metric_condition(
config,
requested_metrics,
sample.split(".")[1].split("_")[0],
get_sample_id(multiqc_key),
k,
),
).model_dump()
)
extract(data[k], output_metrics, k, sample)
extract(data[k], output_metrics, k, multiqc_key)

return output_metrics

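The new lookahead regex accepts a panel-bed name only when it is immediately followed by `_<digit>` (a version-style suffix), which prevents prefix collisions between similarly named kits. A sketch of that check, using hypothetical kit names for illustration:

```python
import re


def matches_panel(panel_bed: str, capture_kit: str) -> bool:
    """True when panel_bed occurs in capture_kit immediately before '_<digit>'."""
    return re.search(rf"{re.escape(panel_bed)}(?=_\d)", capture_kit) is not None


# Hypothetical capture-kit file names (not from the PR):
print(matches_panel("gmsmyeloid", "gmsmyeloid_5.2_hg19_design.bed"))   # → True
print(matches_panel("gmsmyeloid", "gmsmyeloid_plus_hg19_design.bed"))  # → False
```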
44 changes: 44 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,44 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option(
"--extend-to-min-region-size",
default=100,
help="Extend regions shorter than this size to this minimum size.",
)
def extend_bedfile(
input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
):
"""
Process a BED file to ensure regions are at least a minimum size.

Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
extend_to_min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < extend_to_min_region_size:
center = (start + end) // 2
half_size = extend_to_min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if extend_to_min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
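The centering arithmetic in `extend_bedfile` can be illustrated on its own; this sketch reproduces the same computation outside the file I/O (note that clamping at position 0 can still leave a region shorter than the minimum, as in the original):

```python
def extend_region(start: int, end: int, min_size: int = 100):
    """Re-center a short region and pad it out to min_size, as in extend_bedfile."""
    if end - start >= min_size:
        return start, end
    center = (start + end) // 2
    half = min_size // 2
    new_start = max(0, center - half)
    new_end = center + half
    if min_size % 2 != 0:  # odd sizes get the extra base on the right
        new_end += 1
    return new_start, new_end


print(extend_region(120, 140))  # 20 bp region, centered at 130 → (80, 180)
print(extend_region(0, 300))    # already long enough → (0, 300)
```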
78 changes: 78 additions & 0 deletions BALSAMIC/assets/scripts/immediate_submit.py
@@ -0,0 +1,78 @@
"""Script to submit jobs to a cluster."""
import shutil
from typing import Any, Dict, List, Optional

import click
from snakemake import utils

from BALSAMIC.commands.options import (
OPTION_BENCHMARK,
OPTION_CLUSTER_ACCOUNT,
OPTION_CLUSTER_MAIL,
OPTION_CLUSTER_MAIL_TYPE,
OPTION_CLUSTER_PROFILE,
OPTION_CLUSTER_QOS,
)
from BALSAMIC.constants.cluster import QOS, ClusterProfile
from BALSAMIC.models.scheduler import Scheduler


@click.command()
@click.argument("case_id", nargs=1, required=True, type=click.STRING)
@click.argument("dependencies", nargs=-1, type=click.STRING)
@click.argument("job_script", nargs=1, type=click.Path(exists=True, resolve_path=True))
@OPTION_CLUSTER_ACCOUNT
@OPTION_BENCHMARK
@OPTION_CLUSTER_MAIL_TYPE
@OPTION_CLUSTER_MAIL
@OPTION_CLUSTER_PROFILE
@OPTION_CLUSTER_QOS
@click.option(
"--log-dir",
type=click.Path(exists=True, resolve_path=True),
required=True,
help="Logging directory path",
)
@click.option(
"--script-dir",
type=click.Path(exists=True, resolve_path=True),
required=True,
help="Script directory path",
)
def immediate_submit(
account: str,
case_id: str,
job_script: str,
log_dir: str,
profile: ClusterProfile,
script_dir: str,
benchmark: Optional[bool] = False,
dependencies: Optional[List[str]] = None,
mail_type: Optional[str] = None,
mail_user: Optional[str] = None,
qos: Optional[QOS] = QOS.LOW,
) -> None:
"""
Submits jobs to the cluster. Each job is submitted sequentially, and their respective job IDs are collected
from the output. These job IDs are then forwarded as dependencies to the subsequent jobs.
"""
job_script: str = shutil.copy2(src=job_script, dst=script_dir)
job_properties: Dict[str, Any] = utils.read_job_properties(job_script)
scheduler: Scheduler = Scheduler(
account=account,
benchmark=benchmark,
case_id=case_id,
dependencies=dependencies,
job_properties=job_properties,
job_script=job_script,
log_dir=log_dir,
mail_type=mail_type,
mail_user=mail_user,
profile=profile,
qos=qos,
)
scheduler.submit_job()


if __name__ == "__main__":
immediate_submit()
109 changes: 109 additions & 0 deletions BALSAMIC/assets/scripts/modify_tnscope_infofield.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python
import vcfpy
import click
import sys
import logging
from typing import List, Optional

LOG = logging.getLogger(__name__)


def summarize_ad_to_dp(ad_list):
"""
Summarizes the AD (allelic depth) field into total DP (read depth).

Parameters:
ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]

Returns:
int: Total read depth (DP) across all alleles.
"""
if ad_list is None:
return 0 # Return 0 if AD field is not present
return sum(ad_list)


@click.command()
@click.argument("input_vcf", type=click.Path(exists=True))
@click.argument("output_vcf", type=click.Path())
def process_vcf(input_vcf: str, output_vcf: str):
"""
Processes the input VCF file and writes the updated information to the output VCF file.

INPUT_VCF: Path to the input VCF file.
OUTPUT_VCF: Path to the output VCF file.
"""

# Open the input VCF file
reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)

# Ensure the sample name is 'TUMOR'
sample_name: str = reader.header.samples.names[0]
if sample_name != "TUMOR":
LOG.error(
f"The first sample is named '{sample_name}', but 'TUMOR' is expected."
)
sys.exit(1)

# Add AF and DP fields to the header if not already present
if "AF" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "AF"),
("Number", "A"),
("Type", "Float"),
("Description", "Allele Frequency"),
]
)
)

if "DP" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "DP"),
("Number", "1"),
("Type", "Integer"),
("Description", "Total Depth"),
]
)
)

# Open the output VCF file for writing
with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
# Loop through each record (variant)
for record in reader:
# Get the TUMOR sample data
sample_index: int = reader.header.samples.names.index(sample_name)
tumor_call: vcfpy.Call = record.calls[sample_index]

# Check and process AD field
tumor_ad: Optional[List[int]] = tumor_call.data.get(
"AD", None
) # AD is a list [ref_count, alt_count]
if tumor_ad is None:
LOG.warning(
f"AD field is missing for record at position {record.POS} on {record.CHROM}"
)
else:
record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)

# Check and process AF field
tumor_af: Optional[float] = tumor_call.data.get("AF", None)
if tumor_af is None:
LOG.warning(
f"AF field is missing for record at position {record.POS} on {record.CHROM}"
)
record.INFO["AF"] = [0.0] # Default AF to 0.0 if missing
else:
record.INFO["AF"] = [tumor_af] # Wrap AF in a list

# Write the updated record to the output VCF file
writer.write_record(record)

click.echo(f"VCF file processed and saved to {output_vcf}")


if __name__ == "__main__":
process_vcf()
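The DP summarization above is simply the sum of the per-allele depths in AD; the function below mirrors `summarize_ad_to_dp` so its behavior can be checked in isolation:

```python
from typing import List, Optional


def summarize_ad_to_dp(ad_list: Optional[List[int]]) -> int:
    """Total read depth (DP) is the sum of per-allele depths; missing AD yields 0."""
    if ad_list is None:
        return 0
    return sum(ad_list)


print(summarize_ad_to_dp([48, 12]))  # ref + alt depths → 60
print(summarize_ad_to_dp(None))      # AD absent → 0
```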