Merge pull request #262 from jonasscheid/handle-supported-alleles
Add supported_alleles.json to handle supported alleles per predictor
jonasscheid authored Jan 29, 2025
2 parents c1efc39 + 97d7b63 commit eef4be2
Showing 13 changed files with 163 additions and 79 deletions.
1 change: 1 addition & 0 deletions assets/supported_alleles.json

Large diffs are not rendered by default.
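
The JSON itself is not rendered here, but prepare_prediction_input.py (further down in this commit) indexes it per predictor and checks allele membership, which pins down its rough shape. A minimal sketch under that assumption — the allele values below are illustrative, not taken from the real file:

# Assumed layout of assets/supported_alleles.json (illustrative values):
# one key per predictor, each mapping to the mhcgnomes-normalized alleles
# it supports. Inferred from the supported_alleles_dict[tool] lookups below.
supported_alleles = {
    "mhcflurry": ["HLA-A*02:01", "HLA-B*07:02"],
    "netmhcpan-4.1": ["HLA-A*02:01", "HLA-C*07:01"],
}
print("HLA-A*02:01" in supported_alleles["mhcflurry"])  # True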

6 changes: 3 additions & 3 deletions modules/local/merge_predictions/main.nf
@@ -2,10 +2,10 @@ process MERGE_PREDICTIONS {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcgnomes=1.8.4"
conda "bioconda::mhcgnomes=1.8.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.4--pyh7cba7a3_0' :
- 'quay.io/biocontainers/mhcgnomes:1.8.4--pyh7cba7a3_0' }"
+ 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.6--pyh7cba7a3_0' :
+ 'biocontainers/mhcgnomes:1.8.6--pyh7cba7a3_0' }"

input:
tuple val(meta), path(prediction_files)
15 changes: 9 additions & 6 deletions modules/local/merge_predictions/templates/merge_predictions.py
@@ -8,6 +8,7 @@
import math

import pandas as pd
+ import mhcgnomes

# Create logger object with date and time
import logging
@@ -124,7 +125,7 @@ def _format_prediction_result(self):
self.predictor = 'mhcflurry'
return self._format_mhcflurry_prediction()
elif 'mhcnuggets' in self.file_path:
- self.predictor = 'mhcnuggets'
+ self.predictor = 'mhcnuggetsii' if 'mhcnuggetsii' in self.file_path else 'mhcnuggets'
return self._format_mhcnuggets_prediction()
elif 'netmhcpan' in self.file_path:
self.predictor = 'netmhcpan'
@@ -152,7 +153,7 @@ def _format_mhcflurry_prediction(self) -> pd.DataFrame:
df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "rank"}, inplace=True)
df = df[['sequence', 'allele', 'rank', 'BA']]
df["binder"] = df["rank"] <= PredictorBindingThreshold.MHCFLURRY.value
df["predictor"] = "mhcflurry"
df["predictor"] = self.predictor

return df

@@ -166,7 +167,7 @@ def _format_mhcnuggets_prediction(self) -> pd.DataFrame:
df = df[['sequence', 'allele', 'rank', 'BA']]
# Use IC50 < 500 as threshold since mhcnuggets provides a different ranking compared to other predictors
df["binder"] = df["BA"] >= PredictorBindingThreshold.MHCNUGGETS.value
df["predictor"] = "mhcnuggets"
df["predictor"] = self.predictor

return df

@@ -195,7 +196,7 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['rank'] <= self.threshold
- df_pivot['predictor'] = 'netmhcpan'
+ df_pivot['predictor'] = self.predictor
df_pivot.index.name = ''

return df_pivot
@@ -228,7 +229,7 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
- df_pivot['predictor'] = 'netmhciipan'
+ df_pivot['predictor'] = self.predictor
df_pivot.index.name = ''

return df_pivot
@@ -245,11 +246,13 @@ def main():
output_df.append(result.prediction_df)

output_df = pd.concat(output_df)
+ # Normalize allele names and write to file
+ output_df['allele'] = output_df['allele'].apply(lambda x : mhcgnomes.parse(x).to_string())
output_df.to_csv(f'{args.prefix}_predictions.tsv', sep='\t', index=False)

# Parse versions
versions_this_module = {}
- versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
+ versions_this_module["${task.process}"] = Version.get_versions([argparse, pd, mhcgnomes])
with open("versions.yml", "w") as f:
f.write(Version.format_yaml_like(versions_this_module))

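The normalization added in main() funnels every predictor's allele spelling through mhcgnomes before the merged TSV is written. A minimal sketch of what that call does, assuming mhcgnomes' usual canonical output (worth verifying against the pinned 1.8.6 release):

import mhcgnomes

# Several raw spellings collapse to one canonical name, so downstream
# grouping by allele works across predictors.
for raw in ["A0201", "HLA-A*02:01", "hla-a*02:01"]:
    print(mhcgnomes.parse(raw).to_string())  # each prints: HLA-A*02:01
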
4 changes: 2 additions & 2 deletions modules/local/mhcflurry.nf
@@ -15,14 +15,14 @@ process MHCFLURRY {

output:
tuple val(meta), path("*.csv"), emit: predicted
path "versions.yml", emit: versions
path "versions.yml" , emit: versions

script:
if (meta.mhc_class == "II") {
error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!")
}
def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: meta.sample
+ def prefix = task.ext.prefix ?: "${meta.sample}"

"""
mhcflurry-downloads fetch models_class1_presentation
4 changes: 2 additions & 2 deletions modules/local/mhcnuggets/main.nf
@@ -11,7 +11,7 @@ process MHCNUGGETS {
tuple val(meta), path(tsv)

output:
- tuple val(meta), path("*_predicted_mhcnuggets.csv"), emit: predicted
+ tuple val(meta), path("*{_predicted_mhcnuggets.csv,_predicted_mhcnuggetsii.csv}"), emit: predicted
path "versions.yml" , emit: versions

script:
@@ -20,7 +20,7 @@ process MHCNUGGETS {

stub:
def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: meta.sample
+ def prefix = task.ext.prefix ?: "${meta.sample}"
"""
touch ${prefix}_predicted_mhcnuggets.tsv
9 changes: 5 additions & 4 deletions modules/local/mhcnuggets/templates/mhcnuggets.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
self.input = "$tsv"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
- self.alleles = "$meta.alleles".split(";")
+ self.alleles = "$meta.alleles_supported".split(";")
self.parse_ext_args("$task.ext.args")

def parse_ext_args(self, args_string: str) -> None:
@@ -87,7 +87,7 @@ def main():
# Predict and load written tsv file
predicted_df = []
for allele in args.alleles:
- mhcnuggets_allele = 'HLA-'+allele.replace('*','')
+ mhcnuggets_allele = allele.replace('*','')
predict(class_=args.mhc_class, peptides_path = args.input, mhc=mhcnuggets_allele,
output=f'{args.prefix}_{allele}.csv', rank_output=True)

@@ -97,7 +97,8 @@ def main():

# Append dataframe per allele and write file
predicted_df = pd.concat(predicted_df)
- predicted_df.to_csv(f'{args.prefix}_predicted_mhcnuggets.csv', index=False)
+ filename_out = f'{args.prefix}_predicted_mhcnuggets.csv' if args.mhc_class == 'I' else f'{args.prefix}_predicted_mhcnuggetsii.csv'
+ predicted_df.to_csv(filename_out, index=False)

# Parse versions
versions_this_module = {}
@@ -106,7 +107,7 @@ def main():
f.write(Version.format_yaml_like(versions_this_module))
# No __version__ dunder or similar available, need to hardcode version
f.write('mhcnuggets: 2.4.0')


if __name__ == "__main__":
main()
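
Two easy-to-miss consequences of this template change: alleles now arrive pre-normalized via meta.alleles_supported (hence the dropped hard-coded 'HLA-' prefix), and class II output gets its own _predicted_mhcnuggetsii.csv name so merge_predictions.py can distinguish the two mhcnuggets modes by file path. A small illustration; the allele spelling and prefix are assumed example values:

# Allele handed to mhcnuggets' predict(): normalized input already carries
# its HLA- prefix, so only the '*' is stripped.
allele = "HLA-A*02:01"          # assumed mhcgnomes-normalized input
print(allele.replace('*', ''))  # HLA-A02:01

# Output filename now encodes the predictor variant.
prefix, mhc_class = "sample1", "II"  # hypothetical values
filename_out = (f"{prefix}_predicted_mhcnuggets.csv" if mhc_class == "I"
                else f"{prefix}_predicted_mhcnuggetsii.csv")
print(filename_out)  # sample1_predicted_mhcnuggetsii.csv
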
13 changes: 7 additions & 6 deletions modules/local/netmhciipan.nf
@@ -17,12 +17,13 @@ process NETMHCIIPAN {
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
- // Adjust for netMHCIIpan allele format
- def alleles = meta.alleles.tokenize(';').collect {
-     it.contains('DRB') ?
-         it.replace('*', '_').replace(':', '') :
-         ('HLA-' + it.replace('*', '').replace(':', ''))
- }.join(',')
+ // Adjust for netMHCIIpan allele format (e.g. DRB1_0101, HLA-DPA10103-DPB10101)
+ def alleles = meta.alleles_supported.tokenize(';')
+     .collect {
+         it.contains('DRB') ?
+             it.replace('*', '_').replace(':', '').replace('HLA-', '') :
+             it.replace('*', '').replace(':', '').replace('/','-')
+     }.join(',')

"""
netmhciipan/netMHCIIpan \
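Because meta.alleles_supported is mhcgnomes-normalized, the closure above must strip the 'HLA-' prefix for DRB alleles and flatten heterodimers. A rough Python rendering of that Groovy logic — the example alleles are assumed normalized spellings, not values from the pipeline:

# Mirrors the .collect closure in the module above.
def to_netmhciipan(allele: str) -> str:
    if 'DRB' in allele:
        return allele.replace('*', '_').replace(':', '').replace('HLA-', '')
    return allele.replace('*', '').replace(':', '').replace('/', '-')

print(to_netmhciipan('HLA-DRB1*01:01'))             # DRB1_0101
print(to_netmhciipan('HLA-DPA1*01:03/DPB1*01:01'))  # HLA-DPA10103-DPB10101
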
3 changes: 1 addition & 2 deletions modules/local/netmhcpan.nf
@@ -17,8 +17,7 @@ process NETMHCPAN {
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
- // A*01:217 to HLA-A01:217 for meta.alleles: Add HLA- to the allele names and strip the *.
- def alleles = meta.alleles.tokenize(';').collect { 'HLA-' + it.replace('*', '') }.join(',')
+ def alleles = meta.alleles_supported.tokenize(';').collect { it.replace('*', '') }.join(',')

"""
netmhcpan/netMHCpan \
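Same idea for class I: the normalized allele already carries its 'HLA-' prefix, so only the '*' is removed. An illustrative input/output pair, assuming mhcgnomes-normalized spelling:

# "HLA-A*02:01" (normalized) -> "HLA-A02:01", the spelling netMHCpan accepts.
print("HLA-A*02:01".replace('*', ''))  # HLA-A02:01
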
12 changes: 6 additions & 6 deletions modules/local/prepare_prediction_input/main.nf
@@ -2,20 +2,20 @@ process PREPARE_PREDICTION_INPUT {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcgnomes=1.8.4"
conda "bioconda::mhcgnomes=1.8.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.4--pyh7cba7a3_0' :
- 'quay.io/biocontainers/mhcgnomes:1.8.4--pyh7cba7a3_0' }"
+ 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.6--pyh7cba7a3_0' :
+ 'biocontainers/mhcgnomes:1.8.6--pyh7cba7a3_0' }"

input:
tuple val(meta), path(tsv)
+ path(supported_alleles_json)

output:
- tuple val(meta), path("*.{csv,tsv}"), emit: prepared
- path "versions.yml", emit: versions
+ tuple val(meta), path("*.json"), path("*.{csv,tsv}"), emit: prepared
+ path "versions.yml" , emit: versions

script:

template "prepare_prediction_input.py"

stub:

modules/local/prepare_prediction_input/templates/prepare_prediction_input.py
@@ -4,8 +4,10 @@
import shlex
from enum import Enum
import logging
+ import json

import pandas as pd
+ import mhcgnomes

logging.basicConfig(
level=logging.INFO,
@@ -37,14 +39,16 @@ class Arguments:

def __init__(self) -> None:
self.input = "$tsv"
+ self.supported_alleles_json = "$supported_alleles_json"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
- self.alleles = "$meta.alleles".split(";")
- self.tools = "$params.tools"
+ self.alleles = "$meta.alleles"
+ self.tools = "$params.tools".split(',')
self.min_peptide_length_classI = int("$params.min_peptide_length_classI")
self.max_peptide_length_classI = int("$params.max_peptide_length_classI")
self.min_peptide_length_classII = int("$params.min_peptide_length_classII")
self.max_peptide_length_classII = int("$params.max_peptide_length_classII")
+ self.peptide_col_name = "sequence"
self.parse_ext_args("$task.ext.args")

def parse_ext_args(self, args_string: str) -> None:
@@ -100,52 +104,121 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


+ class Utils:
+     def parse_alleles(allele_str: str) -> str:
+         """
+         Function that parses optional txt allele input and
+         normalizes alleles using mhcgnomes
+         Args:
+             alleles (str): File path (one allele per line) or string in required format ("Allele1;Allele2;..")
+         Returns:
+             alleles_normalized: mhcgnomes-normalized allele string in format "Allele1;Allele2;.."
+         """
+         if allele_str.endswith(('.txt','.csv','.tsv')):
+             with open(allele_str, 'r') as f:
+                 alleles = [line.strip() for line in f.readlines()]
+         else:
+             alleles = allele_str.split(";")
+         alleles_normalized = [mhcgnomes.parse(allele).to_string() for allele in alleles]
+
+         return alleles_normalized
+
+     def keep_supported_alleles(allele_ls: list, tool: str, supported_alleles_tool: dict) -> list:
+         """
+         Function that filters out unsupported alleles of predictor for a given MHC class
+         Args:
+             allele_ls (list): List of alleles in mhcgnomes-normalized format
+             tool (str): Name of predictor tool
+             supported_alleles (dict): Dictionary with supported alleles or predictor
+         Returns:
+             tool_allele_input: List of supported alleles for the given tool
+         """
+         tool_allele_input = [allele for allele in allele_ls if allele in supported_alleles_tool]
+         if len(tool_allele_input) == 0:
+             logging.warning(f"No supported alleles for {tool} found. Aborting..")
+         elif len(tool_allele_input) < len(allele_ls):
+             logging.warning(f"Provided alleles {allele_ls}, but only {tool_allele_input} are supported by {tool}. Continuing with supported alleles..")
+
+         return tool_allele_input


def main():
args = Arguments()

+     ### DEBUGGING INPUT
+     #args.input = '/mnt/volume/data/epitopeprediction/peptides.tsv'
+     #args.supported_alleles_json = '/mnt/volume/dev/mhcgnomes/supported_alleles.json'
+     #args.mhc_class = 'I'
+     #args.alleles='/mnt/volume/dev/epitopeprediction-2/modules/local/check_supported_alleles/templates/test_alleles.txt'
+     #args.tools = ['mhcflurry', 'mhcnuggets', 'mhcnuggetsii', 'netmhcpan','netmhciipan']
+     #args.prefix = "test"
+     #args.min_peptide_length_classI = 8
+     #args.max_peptide_length_classI = 12
+     #args.min_peptide_length_classII = 12
+     #args.max_peptide_length_classII = 25
+     #args.peptide_col_name = "sequence"
+     ###

+     # Parse alleles to uniform format
+     alleles_normalized = Utils.parse_alleles(args.alleles)
+     supported_alleles_dict = json.load(open(args.supported_alleles_json))
+     # Filter alleles based on supported alleles for each tool and write to json
+     tools_allele_input = {}
+     for tool in args.tools:
+         tool_allele_input = Utils.keep_supported_alleles(alleles_normalized, tool, supported_alleles_dict[tool])
+         tools_allele_input[tool] = ';'.join(tool_allele_input)
+     with open(f"{args.prefix}_allele_input.json", "w") as f:
+         json.dump(tools_allele_input, f)

# Parse input file to desired format of tools
df_input = pd.read_csv(args.input, sep="\t")
logging.info(f"Reading in file with {len(df_input)} peptides..")
logging.info(f"Read file with {len(df_input)} peptides.")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
- df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
+ df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
- df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]
+ df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")

# Filter peptides based on tool length boundaries and adjust input format
if "mhcflurry" in args.tools and args.mhc_class == "I":
- df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
- logging.info(f"Input for NetMHCpan detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
+ df_mhcflurry = df[df[args.peptide_col_name].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
+ logging.info(f"Input for MHCflurry detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
# Get every combination of sequence and allele and write them to csv with columns sequence and allele
- df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry)
+ df_mhcflurry['allele'] = [tools_allele_input['mhcflurry'].split(';')] * len(df_mhcflurry)
df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True)
- df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True)
+ df_mhcflurry.rename(columns={args.peptide_col_name: "peptide"}, inplace=True)
df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False)

if "mhcnuggets" in args.tools:
if args.mhc_class == "I":
df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSI.value)]
else:
df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSII.value)]
if "mhcnuggets" in args.tools and args.mhc_class == "I":
df_mhcnuggets = df[df[args.peptide_col_name].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSI.value)]
logging.info(f"Input for MHCnuggets detected. Preparing {len(df_mhcnuggets)} peptides for prediction..")
df_mhcnuggets[['sequence']].to_csv(f'{args.prefix}_mhcnuggets_input.tsv', sep="\t", header=False, index=False)

if "netmhcpan" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
if "mhcnuggetsii" in args.tools and args.mhc_class == "II":
df_mhcnuggets_ii = df[df[args.peptide_col_name].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSII.value)]
logging.info(f"Input for MHCnuggets II detected. Preparing {len(df_mhcnuggets_ii)} peptides for prediction..")
df_mhcnuggets_ii[['sequence']].to_csv(f'{args.prefix}_mhcnuggetsii_input.tsv', sep="\t", header=False, index=False)

if "netmhcpan-4.1" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df[args.peptide_col_name].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..")
df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

elif "netmhciipan" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
elif "netmhciipan-4.3" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df[args.peptide_col_name].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCIIpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

# Parse versions
versions_this_module = {}
- versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
+ versions_this_module["${task.process}"] = Version.get_versions([argparse, pd, mhcgnomes])
with open("versions.yml", "w") as f:
f.write(Version.format_yaml_like(versions_this_module))

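Putting the new pieces of this template together: Utils.parse_alleles normalizes whatever the user supplied, Utils.keep_supported_alleles intersects that with each tool's entry in supported_alleles.json, and the result lands in {prefix}_allele_input.json for the predictor modules. A condensed, self-contained sketch of that flow — the inline dict stands in for the real JSON and the allele values are illustrative:

import json
import mhcgnomes

def parse_alleles(allele_str):
    # Normalize semicolon-separated allele spellings via mhcgnomes.
    return [mhcgnomes.parse(a).to_string() for a in allele_str.split(";")]

# Stand-in for json.load(open(args.supported_alleles_json)).
supported = {"mhcflurry": ["HLA-A*02:01"], "netmhcpan-4.1": ["HLA-A*02:01", "HLA-B*07:02"]}

normalized = parse_alleles("A0201;B0702")  # ['HLA-A*02:01', 'HLA-B*07:02']
tools_allele_input = {tool: ";".join(a for a in normalized if a in alleles)
                      for tool, alleles in supported.items()}
print(tools_allele_input)
# {'mhcflurry': 'HLA-A*02:01', 'netmhcpan-4.1': 'HLA-A*02:01;HLA-B*07:02'}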