Skip to content

Commit

Permalink
Merge pull request #253 from jonasscheid/add-mhcflurry-module
Browse files Browse the repository at this point in the history
Add initial mhcflurry module
  • Loading branch information
jonasscheid authored Dec 17, 2024
2 parents cea2e54 + 0610c87 commit 67bc3a2
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 37 deletions.
11 changes: 9 additions & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -74,22 +74,29 @@ process {
]
}

withName: NETMHCPAN{
withName: NETMHCPAN {
ext.args = "-BA"
publishDir = [
path: { "${params.outdir}/netmhcpan" },
mode: params.publish_dir_mode
]
}

withName: NETMHCIIPAN{
withName: NETMHCIIPAN {
ext.args = "-BA"
publishDir = [
path: { "${params.outdir}/netmhciipan" },
mode: params.publish_dir_mode
]
}

withName: MERGE_PREDICTIONS {
publishDir = [
path: { "${params.outdir}/merged_predictions" },
mode: params.publish_dir_mode
]
}

withName: EPYTOPE_GENERATE_PEPTIDES {
publishDir = [
path: { "${params.outdir}/generated_peptides/${meta.sample}" },
Expand Down
35 changes: 32 additions & 3 deletions modules/local/merge_predictions/templates/merge_predictions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)

class PredictorBindingThreshold(Enum):
    """Percentile-rank cutoff (in %) per predictor; peptides at or below it are called binders."""
    # NOTE(review): thresholds follow each tool's commonly used defaults — confirm against tool docs.
    MHCFLURRY = 2
    NETMHCPAN = 2
    NETMHCIIPAN = 5

Expand Down Expand Up @@ -86,6 +87,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
return yaml_str

class PredictionResult:
""" Class to handle prediction results from different predictors. """
def __init__(self, file_path, alleles):
self.file_path = file_path
self.alleles = alleles
Expand Down Expand Up @@ -115,13 +117,33 @@ def _format_prediction_result(self):
def _format_syfpeithi_prediction(self):
pass

def _format_mhcflurry_prediction(self):
pass
def _format_mhcflurry_prediction(self) -> pd.DataFrame:
    """
    Read in mhcflurry prediction output comprising the columns
    `peptide,allele,mhcflurry_affinity,mhcflurry_affinity_percentile,mhcflurry_processing_score,
    mhcflurry_presentation_score,mhcflurry_presentation_percentile`

    Returns: pd.DataFrame with columns `sequence,allele,PS_Rank,BA_Rank,binder,predictor`
    """
    df = pd.read_csv(self.file_path)
    # Harmonize column names with the other predictors' output format.
    df.rename(
        columns={
            "peptide": "sequence",
            "mhcflurry_presentation_percentile": "PS_Rank",
            "mhcflurry_affinity_percentile": "BA_Rank",
        },
        inplace=True,
    )
    # .copy() prevents a pandas SettingWithCopyWarning when the new
    # `binder`/`predictor` columns are assigned onto the column subset below.
    df = df[["sequence", "allele", "PS_Rank", "BA_Rank"]].copy()
    # Binder call: presentation percentile rank at or below the MHCflurry threshold.
    df["binder"] = df["PS_Rank"] <= PredictorBindingThreshold.MHCFLURRY.value
    df["predictor"] = "mhcflurry"

    return df


def _format_mhcnuggets_prediction(self):
pass

def _format_netmhcpan_prediction(self) -> pd.DataFrame:
"""
Read in netmhcpan prediction output comprising the columns
`Peptide,Rank,EL_Rank,BA_Rank` for multiple alleles.
Returns: pd.DataFrame with columns `sequence,allele,EL_Rank,BA_Rank,binder,predictor`
"""
# Map with allele index to allele name. NetMHCpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
Expand Down Expand Up @@ -151,6 +173,12 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
return df_pivot

def _format_netmhciipan_prediction(self) -> pd.DataFrame:
"""
Read in netmhciipan prediction output comprising the columns
`Peptide,Rank,Rank_BA` for multiple alleles.
Returns: pd.DataFrame with columns `sequence,allele,Rank,Rank_BA,binder,predictor`
"""
# Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
Expand All @@ -173,7 +201,8 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
df_pivot.rename(columns={"Rank": "EL_Rank", "Rank_BA": "BA_Rank"}, inplace=True)
df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
df_pivot['predictor'] = 'netmhciipan'
df_pivot.index.name = ''

Expand Down
31 changes: 22 additions & 9 deletions modules/local/mhcflurry.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,46 @@ process MHCFLURRY {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcflurry=2.0.6"
conda "bioconda::mhcflurry=2.1.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_0' :
'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_0' }"
'https://depot.galaxyproject.org/singularity/mhcflurry:2.1.4--pyh7e72e81_1' :
'quay.io/biocontainers/mhcflurry:2.1.4--pyh7e72e81_1' }"

// MHCflurry downloads models always to ~/.local/share/mhcflurry
containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : ''

input:
tuple val(meta), path(peptide_file)

output:
tuple val(meta), path("*.tsv"), emit: predicted
tuple val(meta), path("*.csv"), emit: predicted
path "versions.yml", emit: versions

script:
if (meta.mhc_class == "II") {
error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!")
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample

"""
mhcflurry-downloads fetch models_class1_presentation
mhcflurry-predict \\
$peptide_file \\
--out ${prefix}_predicted_mhcflurry.csv \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
\$(mhcflurry-predict --version)
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
"""
touch ${prefix}_predicted_mhcflurry.tsv
touch ${prefix}_predicted_mhcflurry.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
datefmt="%Y-%m-%d %H:%M:%S",
)
class MinLength(Enum):
MHCFLURRY = 5
NETMHCPAN = 8
NETMHCIIPAN = 8

# Longest peptide length (in amino acids) each predictor accepts.
# Functional Enum API; member order and values match the original declaration.
MaxLength = Enum(
    "MaxLength",
    {
        "MHCFLURRY": 15,
        "NETMHCPAN": 14,
        "NETMHCIIPAN": 25,
    },
)

Expand All @@ -27,13 +29,14 @@ class MaxNumberOfAlleles(Enum):

class Arguments:
"""
Parses the argments, including the ones coming from $task.ext.args.
Parses the arguments, including the ones coming from $task.ext.args.
"""

def __init__(self) -> None:
self.input = "$tsv"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
self.alleles = "$meta.alleles".split(";")
self.tools = "$params.tools"
self.min_peptide_length_classI = int("$params.min_peptide_length_classI")
self.max_peptide_length_classI = int("$params.max_peptide_length_classI")
Expand Down Expand Up @@ -97,31 +100,37 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
def main():
args = Arguments()

df = pd.read_csv(args.input, sep="\t")
len_df = len(df)
logging.info(f"Reading in file with {len(df)} peptides..")
df_input = pd.read_csv(args.input, sep="\t")
logging.info(f"Reading in file with {len(df_input)} peptides..")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

# Filter peptides based on tool length boundaries and adjust input format
if "netmhcpan" in args.tools and args.mhc_class == "I":
logging.info("Input for NetMHCpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

if "netmhciipan" in args.tools and args.mhc_class == "II":
logging.info("Input for NetMHCIIpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)
df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")
else:
logging.info(f"{len(df)} peptides post-filtering will be predicted..")

# Filter peptides based on tool length boundaries and adjust input format
if "mhcflurry" in args.tools and args.mhc_class == "I":
df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
# Get every combination of sequence and allele and write them to csv with columns sequence and allele
df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry)
df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True)
df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True)
df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False)

if "netmhcpan" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..")
df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

elif "netmhciipan" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

# Parse versions
versions_this_module = {}
Expand Down
6 changes: 3 additions & 3 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,18 @@
},
"max_peptide_length_classI": {
"type": "integer",
"default": 11,
"default": 14,
"description": "Specifies the maximum peptide length.",
"help_text": "Specifies the maximum peptide length (not applied when `--peptides` is specified). Default: MHC class I: 11 aa, MHC class II: 16 aa"
},
"min_peptide_length_classII": {
"type": "integer",
"default": 15,
"default": 8,
"description": "Specifies the minimum peptide length for MHC class II peptides."
},
"max_peptide_length_classII": {
"type": "integer",
"default": 16,
"default": 25,
"description": "Specifies the maximum peptide length for MHC class II peptides."
},
"tools": {
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/mhc_binding_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
return [meta, file]
}
.set{ ch_prediction_input }
ch_prediction_input.netmhciipan.view()
ch_prediction_input.mhcflurry.view()
SYFPEITHI ( ch_prediction_input.syfpeithi )
ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
Expand Down

0 comments on commit 67bc3a2

Please sign in to comment.