Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add msi analysis for TO workflow #1463

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions BALSAMIC/commands/config/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
OPTION_GENOME_VERSION,
OPTION_GENS_COV_PON,
OPTION_GNOMAD_AF5,
OPTION_MSI_PON,
OPTION_NORMAL_SAMPLE_NAME,
OPTION_PANEL_BED,
OPTION_PON_CNN,
Expand Down Expand Up @@ -86,6 +87,7 @@
@OPTION_GENOME_INTERVAL
@OPTION_GENS_COV_PON
@OPTION_GNOMAD_AF5
@OPTION_MSI_PON
@OPTION_NORMAL_SAMPLE_NAME
@OPTION_PANEL_BED
@OPTION_PON_CNN
Expand All @@ -99,166 +101,171 @@
@OPTION_UMI_TRIM_LENGTH
@OPTION_UMI_MIN_READS
@click.pass_context
def case_config(
context: click.Context,
adapter_trim: bool,
analysis_dir: Path,
analysis_workflow: AnalysisWorkflow,
background_variants: Path,
balsamic_cache: Path,
cache_version: str,
cadd_annotations: Path,
cancer_germline_snv_observations: Path,
cancer_somatic_snv_observations: Path,
cancer_somatic_sv_observations: Path,
case_id: str,
clinical_snv_observations: Path,
clinical_sv_observations: Path,
exome: bool,
fastq_path: Path,
gender: Gender,
genome_version: GenomeVersion,
genome_interval: Path,
gens_coverage_pon: Path,
gnomad_min_af5: Path,
msi_pon: Path,
normal_sample_name: str,
panel_bed: Path,
pon_cnn: Path,
quality_trim: bool,
sentieon_install_dir: Path,
sentieon_license: str,
swegen_snv: Path,
swegen_sv: Path,
tumor_sample_name: str,
umi: bool,
umi_trim_length: int,
umi_min_reads: str | None,
):
references_path: Path = Path(balsamic_cache, cache_version, genome_version)
references: Dict[str, Path] = get_absolute_paths_dict(
base_path=references_path,
data=read_json(Path(references_path, f"reference.{FileType.JSON}").as_posix()),
)
cadd_annotations_path = {"cadd_annotations": cadd_annotations}
if cadd_annotations:
references.update(cadd_annotations_path)

if any([genome_interval, gens_coverage_pon, gnomad_min_af5]):
if panel_bed:
raise click.BadParameter(
"GENS is currently not compatible with TGA analysis, only WGS."
)
if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]):
raise click.BadParameter(
"All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
)

gens_ref_files = {
"genome_interval": genome_interval,
"gens_coverage_pon": gens_coverage_pon,
"gnomad_min_af5": gnomad_min_af5,
}

references.update(
{
gens_file: path
for gens_file, path in gens_ref_files.items()
if path is not None
}
)

msi_pon_path = {"msi_pon": msi_pon}
if msi_pon:
references.update(msi_pon_path)
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved

variants_observations = {
"clinical_snv_observations": clinical_snv_observations,
"clinical_sv_observations": clinical_sv_observations,
"cancer_germline_snv_observations": cancer_germline_snv_observations,
"cancer_somatic_snv_observations": cancer_somatic_snv_observations,
"cancer_somatic_sv_observations": cancer_somatic_sv_observations,
"swegen_snv_frequency": swegen_snv,
"swegen_sv_frequency": swegen_sv,
}
references.update(
{
observations: path
for observations, path in variants_observations.items()
if path is not None
}
)

analysis_fastq_dir: str = get_analysis_fastq_files_directory(
case_dir=Path(analysis_dir, case_id).as_posix(), fastq_path=fastq_path
)
result_dir: Path = Path(analysis_dir, case_id, "analysis")
log_dir: Path = Path(analysis_dir, case_id, "logs")
script_dir: Path = Path(analysis_dir, case_id, "scripts")
benchmark_dir: Path = Path(analysis_dir, case_id, "benchmarks")
dag_path: Path = Path(
analysis_dir, case_id, f"{case_id}_BALSAMIC_{balsamic_version}_graph.pdf"
)
for directory in [result_dir, log_dir, script_dir, benchmark_dir]:
directory.mkdir(exist_ok=True)

config_collection_dict = ConfigModel(
sentieon={
"sentieon_install_dir": sentieon_install_dir,
"sentieon_license": sentieon_license,
"sentieon_exec": Path(sentieon_install_dir, "bin", "sentieon").as_posix(),
"dnascope_model": SENTIEON_DNASCOPE_MODEL.as_posix(),
"tnscope_model": SENTIEON_TNSCOPE_MODEL.as_posix(),
},
QC={
"quality_trim": quality_trim,
"adapter_trim": adapter_trim,
"umi_trim": umi if panel_bed else False,
"umi_trim_length": umi_trim_length,
},
analysis={
"case_id": case_id,
"gender": gender,
"analysis_dir": analysis_dir,
"fastq_path": analysis_fastq_dir,
"analysis_type": "paired" if normal_sample_name else "single",
"log": log_dir.as_posix(),
"script": script_dir.as_posix(),
"result": result_dir.as_posix(),
"benchmark": benchmark_dir.as_posix(),
"dag": dag_path.as_posix(),
"sequencing_type": "targeted" if panel_bed else "wgs",
"analysis_workflow": analysis_workflow,
"config_creation_date": datetime.now().strftime("%Y-%m-%d %H:%M"),
},
custom_filters={"umi_min_reads": umi_min_reads if umi_min_reads else None},
reference=references,
singularity={
"image": Path(balsamic_cache, cache_version, "containers").as_posix()
},
background_variants=background_variants,
samples=get_sample_list(
tumor_sample_name=tumor_sample_name,
normal_sample_name=normal_sample_name,
fastq_path=analysis_fastq_dir,
),
vcf=VCF_DICT,
bioinfo_tools=BIOINFO_TOOL_ENV,
bioinfo_tools_version=get_bioinfo_tools_version(
bioinfo_tools=BIOINFO_TOOL_ENV,
container_conda_env_path=CONTAINERS_DIR,
),
panel=(
{
"exome": exome,
"capture_kit": panel_bed,
"chrom": get_panel_chrom(panel_bed),
"pon_cnn": pon_cnn,
}
if panel_bed
else None
),
).model_dump(by_alias=True, exclude_none=True)
LOG.info("Balsamic config model instantiated successfully")

config_path = Path(analysis_dir, case_id, case_id + ".json").as_posix()
write_json(json_obj=config_collection_dict, path=config_path)
LOG.info(f"Config file saved successfully - {config_path}")

generate_graph(config_collection_dict, config_path)
LOG.info(f"BALSAMIC Workflow has been configured successfully!")

Check notice on line 271 in BALSAMIC/commands/config/case.py

View check run for this annotation

codefactor.io / CodeFactor

BALSAMIC/commands/config/case.py#L104-L271

Complex Method
8 changes: 8 additions & 0 deletions BALSAMIC/commands/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,14 @@
show_default=True,
)

OPTION_MSI_PON = click.option(
"--msi-pon",
type=click.Path(exists=True, resolve_path=True),
required=False,
help="Path of MSI PON baseline file",
)
ivadym marked this conversation as resolved.
Show resolved Hide resolved


OPTION_NORMAL_SAMPLE_NAME = click.option(
"--normal-sample-name",
required=False,
Expand Down
4 changes: 4 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -423,5 +423,9 @@
"msisensorpro_msi_tumor_normal": {
"time": "08:00:00",
"n": 24
},
"msisensorpro_msi_tumor_only": {
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved
"time": "08:00:00",
"n": 24
}
}
2 changes: 2 additions & 0 deletions BALSAMIC/constants/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
"annotate": [
"snakemake_rules/annotation/rankscore.rule",
"snakemake_rules/annotation/varcaller_filter_tumor_only.rule",
"snakemake_rules/annotation/msi_tumor_only.rule",
],
},
"paired_targeted": {
Expand Down Expand Up @@ -137,6 +138,7 @@
],
"annotate": [
"snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule",
"snakemake_rules/annotation/msi_tumor_only.rule",
],
},
"paired_wgs": {
Expand Down
57 changes: 57 additions & 0 deletions BALSAMIC/snakemake_rules/annotation/msi_tumor_only.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# vim: syntax=python tabstop=4 expandtab
# coding: utf-8
# Computation of MSI score.

rule msisensorpro_scan_reference:
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved
input:
fa = config["reference"]["reference_genome"],
output:
msi_scan = f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.list"
benchmark:
Path(f"{benchmark_dir}msisensorpro_scan_reference_{config['analysis']['case_id']}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("msisensorpro") + ".sif").as_posix()
threads:
get_threads(cluster_config, "msisensorpro_scan_reference")
params:
tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
case_id = config["analysis"]["case_id"],
message:
"Scanning microsatellite sites using msisensor-pro for {params.case_id}"
shell:
"""
msisensor-pro scan -d {input.fa} -o {output.msi_scan};

rm -rf {params.tmpdir};
"""

rule msisensorpro_msi_tumor_only:
input:
msi_list=f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.list",
bamT= config_model.get_final_bam_name(bam_dir = bam_dir,sample_name=tumor_sample),
output:
msi_result = f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.msi"
benchmark:
Path(f"{benchmark_dir}/msisensorpro_msi_tumor_normal_{config['analysis']['case_id']}.tsv").as_posix()
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved
singularity:
Path(singularity_image,config["bioinfo_tools"].get("msisensorpro") + ".sif").as_posix()
threads:
get_threads(cluster_config,"msisensorpro_msi_tumor_only")
params:
tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
case_id=config["analysis"]["case_id"],
msi_pon_baseline=msi_pon,
message:
"Analysing MSI using msisensor-pro for {params.case_id}"
shell:
"""
if [[ -f "{params.msi_pon_baseline}" ]]; then
msisensor-pro pro -b {threads} -d {params.msi_pon_baseline} -t {input.bamT} -o {params.tmpdir}/msi_{params.case_id};
else
msisensor-pro pro -b {threads} -d {input.msi_list} -t {input.bamT} -o {params.tmpdir}/msi_{params.case_id};
fi

sed 's/\%/MSI/g' {params.tmpdir}/msi_{params.case_id} > {output.msi_result};

rm -rf {params.tmpdir};
"""
13 changes: 13 additions & 0 deletions BALSAMIC/utils/rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,19 @@ def get_swegen_sv(config: dict) -> str:
return Path(config["reference"]["swegen_sv_frequency"]).as_posix()


def get_msi_pon(config: dict) -> str:
"""Returns path for MSI PON baseline

Args:
config: a config dictionary

Returns:
Path for MSI PON baseline file

"""
return Path(config["reference"]["msi_pon"]).as_posix()


def dump_toml(annotations: list) -> str:
"""Returns list of converted annotation in toml format

Expand Down
15 changes: 8 additions & 7 deletions BALSAMIC/workflows/balsamic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ from BALSAMIC.utils.rule import (
get_clinical_snv_observations,
get_clinical_sv_observations,
get_fastp_parameters,
get_msi_pon,
get_pon_cnn,
get_result_dir,
get_rule_output,
Expand Down Expand Up @@ -100,6 +101,7 @@ swegen_snv = ""
clinical_sv = ""
somatic_sv = ""
swegen_sv = ""
msi_pon = ""

if config["analysis"]["sequencing_type"] != "wgs":
pon_cnn: str = get_pon_cnn(config)
Expand Down Expand Up @@ -280,6 +282,8 @@ if "cancer_somatic_sv_observations" in config["reference"]:
if "swegen_sv_frequency" in config["reference"]:
swegen_sv: str = get_swegen_sv(config)

if "msi_pon" in config["reference"]:
msi_pon: str = get_msi_pon(config)

# Capture kit name
if config["analysis"]["sequencing_type"] != "wgs":
Expand Down Expand Up @@ -324,11 +328,12 @@ os.environ["TMPDIR"] = get_result_dir(config)

# CNV report input files
cnv_report_paths = []
cnv_report_paths.append(
f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.msi.pdf"
)

if config["analysis"]["sequencing_type"] == "wgs":
if config["analysis"]["analysis_type"] == "paired":
cnv_report_paths.append(
f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.msi.pdf"
)
cnv_report_paths.append(
f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt.pdf"
)
Expand All @@ -353,10 +358,6 @@ if config["analysis"]["sequencing_type"] == "wgs":
)
)
else:
if config["analysis"]["analysis_type"] == "paired":
cnv_report_paths.append(
f"{vcf_dir}MSI.somatic.{config['analysis']['case_id']}.msisensorpro.msi.pdf"
)
cnv_report_paths.extend(
expand(f"{cnv_dir}tumor.merged-{{plot}}.pdf", plot=["diagram", "scatter"])
)
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Added:
^^^^^^
* MSIsensor-pro container https://github.com/Clinical-Genomics/BALSAMIC/pull/1444
* MSI analysis to the tumor-normal workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1454
* MSI analysis to the tumor-only workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved
* Sentieon install directory path to case config arguments https://github.com/Clinical-Genomics/BALSAMIC/pull/1461

Changed:
Expand Down
41 changes: 41 additions & 0 deletions tests/commands/config/test_config_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,3 +629,44 @@ def test_config_tga_with_exome(
assert Path(
analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}"
).exists()


def test_msi_pon_file_tumor_only(
invoke_cli,
tumor_sample_name: str,
analysis_dir: str,
balsamic_cache: str,
panel_bed_file: str,
msi_pon_path: str,
fastq_dir_tumor_only: str,
case_id_tumor_only: str,
):
"""Test balsamic config case with a PON reference."""

# GIVEN CLI arguments including optional pon reference ".cnn" file

# WHEN invoking the config case command
result = invoke_cli(
[
"config",
"case",
"--case-id",
case_id_tumor_only,
"--analysis-dir",
analysis_dir,
"--fastq-path",
fastq_dir_tumor_only,
"-p",
panel_bed_file,
"--msi-pon",
msi_pon_path,
"--balsamic-cache",
balsamic_cache,
"--tumor-sample-name",
tumor_sample_name,
],
)

# THEN program exits and checks for filepath
assert result.exit_code == 0
assert Path(msi_pon_path).exists()
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,12 @@ def swegen_sv_frequency_path(reference_variants_dir_path: str) -> str:
return Path(reference_variants_dir_path, "swegen_sv.vcf.gz").as_posix()


@pytest.fixture(scope="session")
def msi_pon_path(reference_variants_dir_path: str) -> str:
"""Create path for Swegen SVs."""
khurrammaqbool marked this conversation as resolved.
Show resolved Hide resolved
return Path(reference_variants_dir_path, "msi_pon.list").as_posix()


@pytest.fixture(scope="session", name="invalid_json_file")
def fixture_invalid_json_file(session_tmp_path: Path) -> Path:
"""Return a non-existent json file path."""
Expand Down
10 changes: 10 additions & 0 deletions tests/test_data/references/variants/msi_pon.list
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
chromosome location repeat_unit_length repeat_unit_binary repeat_times left_flank_binary right_flank_binary repeat_unit_basesleft_flank_bases right_flank_bases
1 26453 2 11 6 956 999 GT TGTTA TTGCT
1 28588 1 3 15 698 589 T GGTGG GCATC
1 30867 2 7 12 885 319 CT TCTCC CATTT
1 30910 2 13 6 847 627 TC TCATT GCTAT
1 30935 2 13 6 255 1015 TC ATTTT TTTCT
1 31719 1 0 14 466 711 A CTCAG GTACT
1 31807 2 1 5 51 275 AC AATAT CACAT
1 33449 1 0 15 335 645 A CCATT GGACC
1 33530 1 3 11 1021 204 T TTTTC ATATA
Loading