diff --git a/conf/modules.config b/conf/modules.config
index 6056952..fb24018 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -48,8 +48,7 @@ process {
     withName: PREPARE_PREDICTION_INPUT {
         ext.prefix = {"${meta.sample}"}
         publishDir = [
-            path: { "${params.outdir}/prepare_prediction_input" },
-            mode: params.publish_dir_mode
+            enabled: false
         ]
     }

@@ -63,14 +62,17 @@ process {
     withName: MHCFLURRY {
         publishDir = [
             path: { "${params.outdir}/mhcflurry" },
-            mode: params.publish_dir_mode
+            mode: params.publish_dir_mode,
+            pattern: '*.csv'
         ]
     }

     withName: MHCNUGGETS {
+        ext.prefix = {"${meta.sample}"}
         publishDir = [
             path: { "${params.outdir}/mhcnuggets" },
-            mode: params.publish_dir_mode
+            mode: params.publish_dir_mode,
+            pattern: '*predicted_mhcnuggets.csv'
         ]
     }

@@ -78,19 +80,23 @@ process {
         ext.args = "-BA"
         publishDir = [
             path: { "${params.outdir}/netmhcpan" },
-            mode: params.publish_dir_mode
+            mode: params.publish_dir_mode,
+            pattern: '*.xls'
         ]
     }

     withName: NETMHCIIPAN {
+        ext.prefix = {"${meta.sample}"}
         ext.args = "-BA"
         publishDir = [
             path: { "${params.outdir}/netmhciipan" },
-            mode: params.publish_dir_mode
+            mode: params.publish_dir_mode,
+            pattern: '*.xls'
         ]
     }

     withName: MERGE_PREDICTIONS {
+        ext.prefix = {"${meta.sample}"}
         publishDir = [
             path: { "${params.outdir}/merged_predictions" },
             mode: params.publish_dir_mode
diff --git a/modules/local/merge_predictions/templates/merge_predictions.py b/modules/local/merge_predictions/templates/merge_predictions.py
index 42fcb3a..37e082c 100644
--- a/modules/local/merge_predictions/templates/merge_predictions.py
+++ b/modules/local/merge_predictions/templates/merge_predictions.py
@@ -5,10 +5,11 @@
 from enum import Enum
 import sys
 import typing
+import math

-import mhcgnomes
 import pandas as pd

+# Configure logging with date and time format
 import logging
 logging.basicConfig(
@@ -18,8 +19,9 @@
 )

 class PredictorBindingThreshold(Enum):
-    MHCFLURRY   = 2
-    NETMHCPAN   = 2
+    MHCFLURRY   = 2
+    MHCNUGGETS  = 0.425
+    NETMHCPAN   = 2
     NETMHCIIPAN = 5

 class Arguments:
@@ -29,7 +31,7 @@ class Arguments:

     def __init__(self) -> None:
         self.input = "$prediction_files".split(" ")
-        self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.sample"
+        self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
         self.alleles = sorted("$meta.alleles".split(';'))
         self.parse_ext_args("$task.ext.args")
@@ -86,15 +88,35 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
             yaml_str += f"{spaces}{key}: {value}\\n"
     return yaml_str

+class Utils:
+    @staticmethod
+    def ic50toBA(ic50: float) -> float:
+        """Scale IC50 to a binding affinity (BA) score ranging between 0 and 1."""
+        ic50 = min(ic50, 50000)  # Cap IC50 at 50000
+        return 1 - (math.log10(ic50) / math.log10(50000))
+
+    @staticmethod
+    def BAtoic50(BA: float) -> float:
+        """Convert a binding affinity (BA) score back to IC50."""
+        return 10 ** ((1 - BA) * math.log10(50000))
+
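# Illustrative sketch (not part of the patch): the log-scaling above anchors the
# conventional 500 nM binder cut-off at BA ~ 0.426, which is where the 0.425
# MHCNUGGETS threshold in the enum comes from. A quick standalone check, assuming
# the same 50000 nM cap and log base as Utils.ic50toBA:
import math

def ic50_to_ba(ic50: float) -> float:
    ic50 = min(ic50, 50000)                       # cap at 50000 nM, as in Utils.ic50toBA
    return 1 - math.log10(ic50) / math.log10(50000)

assert abs(ic50_to_ba(500) - 0.4256) < 1e-3       # 500 nM maps to ~0.426, hence the 0.425 threshold
assert ic50_to_ba(1) == 1.0                       # strongest binding maps to 1
assert ic50_to_ba(50000) == 0.0                   # capped weakest binding maps to 0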
 class PredictionResult:
-    """ Class to handle prediction results from different predictors. """
-    def __init__(self, file_path, alleles):
+    def __init__(self, file_path, alleles, threshold=2):
         self.file_path = file_path
         self.alleles = alleles
+        self.threshold = threshold
         self.predictor = None
         self.prediction_df = self._format_prediction_result()

     def _format_prediction_result(self):
+        """
+        Returns a harmonized DataFrame with the prediction results. Output format:
+
+        +----------+---------+-------+-------+--------+-----------+
+        | sequence | allele  | BA    | rank  | binder | predictor |
+        +----------+---------+-------+-------+--------+-----------+
+        | ...      | ...     | ...   | ...   | ...    | ...       |
+        +----------+---------+-------+-------+--------+-----------+
+        """
         if 'syfpeithi' in self.file_path:
             self.predictor = 'syfpeithi'
             return self._format_syfpeithi_prediction()
@@ -122,35 +144,40 @@ def _format_mhcflurry_prediction(self) -> pd.DataFrame:
         Read in mhcflurry prediction output comprising the columns
         `peptide,allele,mhcflurry_affinity,mhcflurry_affinity_percentile,mhcflurry_processing_score,
         mhcflurry_presentation_score,mhcflurry_presentation_percentile`
-
-        Returns: pd.DataFrame with columns `sequence,allele,PS_Rank,BA_Rank,binder,predictor`
         """
         df = pd.read_csv(self.file_path)
-        df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "PS_Rank", "mhcflurry_affinity_percentile": "BA_Rank"}, inplace=True)
-        df = df[['sequence', 'allele', 'PS_Rank', 'BA_Rank']]
-        df["binder"] = df["PS_Rank"] <= PredictorBindingThreshold.MHCFLURRY.value
+        # Convert IC50 to BA
+        df['BA'] = df['mhcflurry_affinity'].apply(Utils.ic50toBA)
+        # Harmonize df to the desired output structure
+        df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "rank"}, inplace=True)
+        df = df[['sequence', 'allele', 'rank', 'BA']]
+        df["binder"] = df["rank"] <= PredictorBindingThreshold.MHCFLURRY.value
         df["predictor"] = "mhcflurry"

         return df

+    def _format_mhcnuggets_prediction(self) -> pd.DataFrame:
+        """Read in mhcnuggets prediction output comprising the columns `peptide,ic50,human_proteome_rank,allele`."""
+        df = pd.read_csv(self.file_path)
+        # Convert IC50 to BA
+        df['BA'] = df['ic50'].apply(Utils.ic50toBA)
+        # Harmonize df to the desired output structure
+        df.rename(columns={"peptide": "sequence", "human_proteome_rank": "rank"}, inplace=True)
+        df = df[['sequence', 'allele', 'rank', 'BA']]
+        # Call binders at BA >= 0.425 (i.e. IC50 <= 500 nM), since the mhcnuggets rank is not comparable to the other predictors' percentile ranks
+        df["binder"] = df["BA"] >= PredictorBindingThreshold.MHCNUGGETS.value
+        df["predictor"] = "mhcnuggets"

-    def _format_mhcnuggets_prediction(self):
-        pass
+        return df

     def _format_netmhcpan_prediction(self) -> pd.DataFrame:
-        """
-        Read in netmhcpan prediction output comprising the columns
-        `Peptide,Rank,EL_Rank,BA_Rank` for multiple alleles.
-
-        Returns: pd.DataFrame with columns `sequence,allele,EL_Rank,BA_Rank,binder,predictor`
-        """
-        # Map with allele index to allele name.NetMHCpan sorts alleles alphabetically
+        # Map with allele index to allele name
         alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
         # Read the file into a DataFrame with no headers initially
         df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
-        df = df[df.columns[df.columns.str.contains('Peptide|EL_Rank|BA_Rank')]]
-        # TODO: Naming needs to be harmonized down the line once all predictors are implemented
-        df = df.rename(columns={'Peptide':'sequence','EL_Rank':'EL_Rank.0','BA_Rank':'BA_Rank.0'})
+        # Extract Peptide, percentile rank and binding affinity
+        df = df[df.columns[df.columns.str.contains('Peptide|EL_Rank|BA-score')]]
+        df = df.rename(columns={'Peptide':'sequence','EL_Rank':'EL_Rank.0','BA-score':'BA-score.0'})
         # to longformat based on .0|1|2..
         df_long = pd.melt(
             df,
@@ -159,14 +186,15 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
             var_name="metric",
             value_name="value",
         )
+        # Extract the allele information (e.g., .0, .1, etc.)
         df_long["allele"] = df_long["metric"].str.split('.').str[1]
-        df_long["metric"] = df_long["metric"].str.split('.').str[0]
+        df_long["metric"] = df_long["metric"].apply(lambda x: x.split('.')[0].replace('EL_Rank','rank').replace('BA-score','BA'))

         # Pivot table to organize columns properly
         df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
         df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
-        df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCPAN.value
+        df_pivot['binder'] = df_pivot['rank'] <= self.threshold
         df_pivot['predictor'] = 'netmhcpan'
         df_pivot.index.name = ''

         return df_pivot
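# Illustrative sketch (not part of the patch): the wide-to-long-to-wide reshaping
# used above, on a minimal made-up netMHCpan-style frame. The .0/.1 column
# suffixes encode the allele index; melt plus pivot_table turns them into one
# row per (sequence, allele) pair.
import pandas as pd

toy = pd.DataFrame({
    'sequence': ['SIINFEKL'],
    'rank.0': [0.5], 'BA.0': [0.91],
    'rank.1': [3.2], 'BA.1': [0.40],
})
long = toy.melt(id_vars=['sequence'], var_name='metric', value_name='value')
long['allele'] = long['metric'].str.split('.').str[1]    # '0', '1', ...
long['metric'] = long['metric'].str.split('.').str[0]    # 'rank' or 'BA'
wide = long.pivot_table(index=['sequence', 'allele'], columns='metric', values='value').reset_index()
# wide now has one row per allele index with the columns: sequence, allele, BA, rank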
@@ -174,18 +202,16 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
         """
-        Read in netmhciipan prediction output comprising the columns
-        `Peptide,Rank,Rank_BA` for multiple alleles.
-
-        Returns: pd.DataFrame with columns `sequence,allele,Rank,Rank_BA,binder,predictor`
+        Read in netmhciipan prediction output and extract the columns
+        `Peptide,Rank,Score_BA` for multiple alleles.
         """
         # Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically
         alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
         # Read the file into a DataFrame with no headers initially
         df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
-        df = df[df.columns[df.columns.str.contains('Peptide|Rank|Rank_BA')]]
-        # TODO: Naming needs to be harmonized down the line once all predictors are implemented
-        df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Rank_BA':'Rank_BA.0'})
+        # Extract Peptide, percentile rank and binding affinity
+        df = df[df.columns[df.columns.str.contains('Peptide|Rank(?!_BA)|Score_BA')]]
+        df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Score_BA':'Score_BA.0'})
         # to longformat based on .0|1|2..
         df_long = pd.melt(
             df,
@@ -196,13 +222,12 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
             var_name="metric",
             value_name="value",
         )
         # Extract the allele information (e.g., .0, .1, etc.)
         df_long["allele"] = df_long["metric"].str.split('.').str[1]
-        df_long["metric"] = df_long["metric"].str.split('.').str[0]
+        df_long["metric"] = df_long["metric"].apply(lambda x: x.split('.')[0].replace('Rank','rank').replace('Score_BA','BA'))

         # Pivot table to organize columns properly
         df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
         df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
-        df_pivot.rename(columns={"Rank": "EL_Rank", "Rank_BA": "BA_Rank"}, inplace=True)
-        df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
+        df_pivot['binder'] = df_pivot['rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
         df_pivot['predictor'] = 'netmhciipan'
         df_pivot.index.name = ''

         return df_pivot

@@ -211,11 +236,16 @@ def main():
     args = Arguments()

+    # Harmonize the per-predictor outputs and merge them into one table
+    output_df = []
     for file in args.input:
         result = PredictionResult(file, args.alleles)

         logging.info(f"Writing {len(result.prediction_df)} {result.predictor} predictions to file..")
-        result.prediction_df.to_csv(f"{args.prefix}_{result.predictor}.tsv", sep="\t", index=False)
+        output_df.append(result.prediction_df)
+
+    output_df = pd.concat(output_df)
+    output_df.to_csv(f'{args.prefix}_predictions.tsv', sep='\t', index=False)

     # Parse versions
     versions_this_module = {}
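# Illustrative sketch (not part of the patch): the 'Peptide|Rank(?!_BA)|Score_BA'
# pattern above uses a negative lookahead so that the percentile-rank columns
# ('Rank', 'Rank.1', ...) are kept while any 'Rank_BA*' columns are dropped.
# A small check with made-up column names:
import pandas as pd

cols = pd.Index(['Peptide', 'Rank', 'Score_BA', 'Rank_BA', 'Rank.1', 'Score_BA.1', 'Rank_BA.1'])
kept = cols[cols.str.contains('Peptide|Rank(?!_BA)|Score_BA')]
assert list(kept) == ['Peptide', 'Rank', 'Score_BA', 'Rank.1', 'Score_BA.1']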
diff --git a/modules/local/mhcnuggets/environment.yml b/modules/local/mhcnuggets/environment.yml
new file mode 100644
index 0000000..e69de29
diff --git a/modules/local/mhcnuggets.nf b/modules/local/mhcnuggets/main.nf
similarity index 67%
rename from modules/local/mhcnuggets.nf
rename to modules/local/mhcnuggets/main.nf
index a196eb5..73d0649 100644
--- a/modules/local/mhcnuggets.nf
+++ b/modules/local/mhcnuggets/main.nf
@@ -4,21 +4,19 @@ process MHCNUGGETS {
     conda "bioconda::mhcnuggets=2.4.1"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mhcnuggets:2.4.1--pyh7cba7a3_0' :
-        'quay.io/biocontainers/mhcnuggets:2.4.1--pyh7cba7a3_0' }"
+        'https://depot.galaxyproject.org/singularity/mhcnuggets:2.4.0--pyh7cba7a3_0' :
+        'quay.io/biocontainers/mhcnuggets:2.4.0--pyh7cba7a3_0' }"

     input:
-    tuple val(meta), path(peptide_file)
+    tuple val(meta), path(tsv)

     output:
-    tuple val(meta), path("*.tsv"), emit: predicted
-    path "versions.yml", emit: versions
+    tuple val(meta), path("*_predicted_mhcnuggets.csv"), emit: predicted
+    path "versions.yml"                                , emit: versions

     script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: meta.sample
-    """
-    """
+
+    template "mhcnuggets.py"

     stub:
     def args = task.ext.args ?: ''
diff --git a/modules/local/mhcnuggets/meta.yml b/modules/local/mhcnuggets/meta.yml
new file mode 100644
index 0000000..e69de29
diff --git a/modules/local/mhcnuggets/templates/mhcnuggets.py b/modules/local/mhcnuggets/templates/mhcnuggets.py
new file mode 100644
index 0000000..ea478ca
--- /dev/null
+++ b/modules/local/mhcnuggets/templates/mhcnuggets.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+
+import argparse
+import shlex
+import logging
+
+import pandas as pd
+from mhcnuggets.src.predict import predict
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+class Arguments:
+    """
+    Parses the arguments, including the ones coming from $task.ext.args.
+    """
+
+    def __init__(self) -> None:
+        self.input = "$tsv"
+        self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
+        self.mhc_class = "$meta.mhc_class"
+        self.alleles = "$meta.alleles".split(";")
+        self.parse_ext_args("$task.ext.args")
+
+    def parse_ext_args(self, args_string: str) -> None:
+        """
+        Parse the extended arguments.
+        """
+        # Skip when there are no extended arguments
+        if args_string == "null":
+            args_string = ""
+
+        # Parse the extended arguments
+        args_list = shlex.split(args_string)  # Split the string into a list of arguments
+        parser = argparse.ArgumentParser()
+        # Input parameters
+        args = parser.parse_args(args_list)
+
+        # Assign args attributes to self attributes
+        for attr in vars(args):
+            setattr(self, attr, getattr(args, attr))
+
+
+class Version:
+    """
+    Parse the versions of the modules used in the script.
+    """
+
+    @staticmethod
+    def get_versions(modules: list) -> dict:
+        """
+        This function takes a list of modules and returns a dictionary with the
+        versions of each module.
+        """
+        return {module.__name__: module.__version__ for module in modules}
+
+    @staticmethod
+    def format_yaml_like(data: dict, indent: int = 0) -> str:
+        """
+        Formats a dictionary to a YAML-like string.
+
+        Args:
+            data (dict): The dictionary to format.
+            indent (int): The current indentation level.
+
+        Returns:
+            yaml_str: A string formatted as YAML.
+        """
+        yaml_str = ""
+        for key, value in data.items():
+            spaces = "  " * indent
+            if isinstance(value, dict):
+                yaml_str += f"{spaces}{key}:\\n{Version.format_yaml_like(value, indent + 1)}"
+            else:
+                yaml_str += f"{spaces}{key}: {value}\\n"
+        return yaml_str
+
+def main():
+    args = Arguments()
+
+    df_input = pd.read_csv(args.input, sep="\t")
+    logging.info(f"Reading in file with {len(df_input)} peptides..")
+
+    # Predict per allele and load the csv files written by mhcnuggets
+    predicted_df = []
+    for allele in args.alleles:
+        mhcnuggets_allele = 'HLA-' + allele.replace('*', '')
+        predict(class_=args.mhc_class, peptides_path=args.input, mhc=mhcnuggets_allele,
+                output=f'{args.prefix}_{allele}.csv', rank_output=True)
+
+        tmp_df = pd.read_csv(f'{args.prefix}_{allele}_ranks.csv')
+        tmp_df['allele'] = allele
+        predicted_df.append(tmp_df)
+
+    # Concatenate the per-allele DataFrames and write a single file
+    predicted_df = pd.concat(predicted_df)
+    predicted_df.to_csv(f'{args.prefix}_predicted_mhcnuggets.csv', index=False)
+
+    # Parse versions
+    versions_this_module = {}
+    versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
+    with open("versions.yml", "w") as f:
+        f.write(Version.format_yaml_like(versions_this_module))
+        # No __version__ dunder or similar available, need to hardcode the version
+        # (indented so it nests under the process key)
+        f.write('  mhcnuggets: 2.4.0\\n')
+
+
+if __name__ == "__main__":
+    main()
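# Illustrative sketch (not part of the patch): mhcnuggets expects allele names
# like 'HLA-A02:01', while the pipeline metadata is assumed here to carry
# star-notation such as 'A*02:01'; hence the 'HLA-' prefix plus '*' removal in
# the loop above.
def to_mhcnuggets_allele(allele: str) -> str:
    return 'HLA-' + allele.replace('*', '')

assert to_mhcnuggets_allele('A*02:01') == 'HLA-A02:01'
assert to_mhcnuggets_allele('DRB1*01:01') == 'HLA-DRB101:01'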
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{Version.format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def main(): + args = Arguments() + + df_input = pd.read_csv(args.input, sep="\t") + logging.info(f"Reading in file with {len(df_input)} peptides..") + + # Predict and load written tsv file + predicted_df = [] + for allele in args.alleles: + mhcnuggets_allele = 'HLA-'+allele.replace('*','') + predict(class_=args.mhc_class, peptides_path = args.input, mhc=mhcnuggets_allele, + output=f'{args.prefix}_{allele}.csv', rank_output=True) + + tmp_df = pd.read_csv(f'{args.prefix}_{allele}_ranks.csv') + tmp_df['allele'] = allele + predicted_df.append(tmp_df) + + # Append dataframe per allele and write file + predicted_df = pd.concat(predicted_df) + predicted_df.to_csv(f'{args.prefix}_predicted_mhcnuggets.csv', index=False) + + # Parse versions + versions_this_module = {} + versions_this_module["${task.process}"] = Version.get_versions([argparse, pd]) + with open("versions.yml", "w") as f: + f.write(Version.format_yaml_like(versions_this_module)) + # No __version__ dunder or similar available, need to hardcode version + f.write('mhcnuggets: 2.4.0') + + +if __name__ == "__main__": + main() diff --git a/modules/local/netmhciipan.nf b/modules/local/netmhciipan.nf index e620ead..8cf60c0 100644 --- a/modules/local/netmhciipan.nf +++ b/modules/local/netmhciipan.nf @@ -16,7 +16,7 @@ process NETMHCIIPAN { error "NETMHCIIPAN only supports MHC class II. Use NETMHCIIPAN for MHC class II." } def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def prefix = task.ext.prefix ?: "${meta.id}" // Adjust for netMHCIIpan allele format def alleles = meta.alleles.tokenize(';').collect { it.contains('DRB') ? diff --git a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py index 9748e86..ebd1144 100644 --- a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py +++ b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py @@ -14,11 +14,14 @@ ) class MinLength(Enum): MHCFLURRY = 5 + MHCNUGGETS = 1 NETMHCPAN = 8 NETMHCIIPAN = 8 class MaxLength(Enum): MHCFLURRY = 15 + MHCNUGGETS_CLASSI = 15 + MHCNUGGETS_CLASSII = 30 NETMHCPAN = 14 NETMHCIIPAN = 25 @@ -122,6 +125,14 @@ def main(): df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True) df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False) + if "mhcnuggets" in args.tools: + if args.mhc_class == "I": + df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSI.value)] + else: + df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSII.value)] + logging.info(f"Input for MHCnuggets detected. Preparing {len(df_mhcnuggets)} peptides for prediction..") + df_mhcnuggets[['sequence']].to_csv(f'{args.prefix}_mhcnuggets_input.tsv', sep="\t", header=False, index=False) + if "netmhcpan" in args.tools and args.mhc_class == "I": df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)] logging.info(f"Input for NetMHCpan detected. 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index eb9df96..51fa743 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -96,7 +96,6 @@
         "tools": {
             "type": "string",
             "default": "syfpeithi",
-            "pattern": "^(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.3|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1)(,(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.3|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1))*$",
             "help_text": "Specifies the tool(s) to use. Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.3`.",
             "description": "Specifies the prediction tool(s) to use."
         },
diff --git a/subworkflows/local/mhc_binding_prediction.nf b/subworkflows/local/mhc_binding_prediction.nf
index cdcae8f..0eb1ac6 100755
--- a/subworkflows/local/mhc_binding_prediction.nf
+++ b/subworkflows/local/mhc_binding_prediction.nf
@@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
                     return [meta, file]
                 }
             .set{ ch_prediction_input }
-        ch_prediction_input.mhcflurry.view()
+
         SYFPEITHI ( ch_prediction_input.syfpeithi )
         ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
         ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
@@ -76,8 +76,8 @@ workflow MHC_BINDING_PREDICTION {
         ch_versions = ch_versions.mix(NETMHCIIPAN.out.versions)
         ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCIIPAN.out.predicted)
     }
-    ch_binding_predictors_out.groupTuple().view()
-    MERGE_PREDICTIONS (ch_binding_predictors_out.groupTuple())
+
+    MERGE_PREDICTIONS( ch_binding_predictors_out.groupTuple() )
     ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions)

     emit:
diff --git a/subworkflows/local/utils_nfcore_epitopeprediction_pipeline/main.nf b/subworkflows/local/utils_nfcore_epitopeprediction_pipeline/main.nf
index fb5ae5c..1062872 100644
--- a/subworkflows/local/utils_nfcore_epitopeprediction_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_epitopeprediction_pipeline/main.nf
@@ -66,7 +66,7 @@ workflow PIPELINE_INITIALISATION {
     //
     // Custom validation for pipeline parameters
     //
-    //validateInputParameters()
+    validateInputParameters()

     //
     // Create channel from input file provided through params.input
@@ -137,9 +137,31 @@ workflow PIPELINE_COMPLETION {
 //
 // Check and validate pipeline parameters
 //
-//def validateInputParameters() {
-//    genomeExistsError()
-//}
+def validateInputParameters() {
+    validate_tools_param()
+}
+
+//
+// Check if supported tools are specified
+//
+def validate_tools_param() {
+    // List of currently supported tools
+    def valid_tools = [
+        'syfpeithi', 'mhcnuggets', 'mhcflurry', 'netmhcpan-4.1', 'netmhciipan-4.3'
+    ]
+    def tools = params.tools.tokenize(',')
+
+    // Reject any requested tool that is not in valid_tools
+    def invalid_tools = tools.findAll { it.trim() !in valid_tools }
+    if (invalid_tools) {
+        throw new IllegalArgumentException("Invalid tools found: ${invalid_tools.join(', ')}")
+    }
+}

 //
 // Prepare import of NetMHC software