Merge pull request #262 from jonasscheid/handle-supported-alleles
Add supported_alleles.json to handle supported alleles per predictor
jonasscheid authored Jan 29, 2025
2 parents c1efc39 + 97d7b63 commit eef4be2
Showing 13 changed files with 163 additions and 79 deletions.
1 change: 1 addition & 0 deletions assets/supported_alleles.json

Large diffs are not rendered by default.
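
The JSON itself is not rendered here, but prepare_prediction_input.py (further down in this commit) indexes it per predictor and checks allele membership, which pins down its rough shape. A minimal sketch under that assumption — the allele values below are illustrative, not taken from the real file:

# Assumed layout of assets/supported_alleles.json (illustrative values):
# one key per predictor, each mapping to the mhcgnomes-normalized alleles
# it supports. Inferred from the supported_alleles_dict[tool] lookups below.
supported_alleles = {
    "mhcflurry": ["HLA-A*02:01", "HLA-B*07:02"],
    "netmhcpan-4.1": ["HLA-A*02:01", "HLA-C*07:01"],
}
print("HLA-A*02:01" in supported_alleles["mhcflurry"])  # True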

6 changes: 3 additions & 3 deletions modules/local/merge_predictions/main.nf
@@ -2,10 +2,10 @@ process MERGE_PREDICTIONS {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcgnomes=1.8.4"
conda "bioconda::mhcgnomes=1.8.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.4--pyh7cba7a3_0' :
- 'quay.io/biocontainers/mhcgnomes:1.8.4--pyh7cba7a3_0' }"
+ 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.6--pyh7cba7a3_0' :
+ 'biocontainers/mhcgnomes:1.8.6--pyh7cba7a3_0' }"

input:
tuple val(meta), path(prediction_files)
15 changes: 9 additions & 6 deletions modules/local/merge_predictions/templates/merge_predictions.py
@@ -8,6 +8,7 @@
import math

import pandas as pd
+ import mhcgnomes

# Create logger object with date and time
import logging
@@ -124,7 +125,7 @@ def _format_prediction_result(self):
self.predictor = 'mhcflurry'
return self._format_mhcflurry_prediction()
elif 'mhcnuggets' in self.file_path:
- self.predictor = 'mhcnuggets'
+ self.predictor = 'mhcnuggetsii' if 'mhcnuggetsii' in self.file_path else 'mhcnuggets'
return self._format_mhcnuggets_prediction()
elif 'netmhcpan' in self.file_path:
self.predictor = 'netmhcpan'
@@ -152,7 +153,7 @@ def _format_mhcflurry_prediction(self) -> pd.DataFrame:
df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "rank"}, inplace=True)
df = df[['sequence', 'allele', 'rank', 'BA']]
df["binder"] = df["rank"] <= PredictorBindingThreshold.MHCFLURRY.value
df["predictor"] = "mhcflurry"
df["predictor"] = self.predictor

return df

@@ -166,7 +167,7 @@ def _format_mhcnuggets_prediction(self) -> pd.DataFrame:
df = df[['sequence', 'allele', 'rank', 'BA']]
# Use IC50 < 500 as threshold since mhcnuggets provides a different ranking compared to other predictors
df["binder"] = df["BA"] >= PredictorBindingThreshold.MHCNUGGETS.value
df["predictor"] = "mhcnuggets"
df["predictor"] = self.predictor

return df

@@ -195,7 +196,7 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['rank'] <= self.threshold
- df_pivot['predictor'] = 'netmhcpan'
+ df_pivot['predictor'] = self.predictor
df_pivot.index.name = ''

return df_pivot
@@ -228,7 +229,7 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame:
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
- df_pivot['predictor'] = 'netmhciipan'
+ df_pivot['predictor'] = self.predictor
df_pivot.index.name = ''

return df_pivot
@@ -245,11 +246,13 @@ def main():
output_df.append(result.prediction_df)

output_df = pd.concat(output_df)
+ # Normalize allele names and write to file
+ output_df['allele'] = output_df['allele'].apply(lambda x : mhcgnomes.parse(x).to_string())
output_df.to_csv(f'{args.prefix}_predictions.tsv', sep='\t', index=False)

# Parse versions
versions_this_module = {}
- versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
+ versions_this_module["${task.process}"] = Version.get_versions([argparse, pd, mhcgnomes])
with open("versions.yml", "w") as f:
f.write(Version.format_yaml_like(versions_this_module))

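The normalization added in main() funnels every predictor's allele spelling through mhcgnomes before the merged TSV is written. A minimal sketch of what that call does, assuming mhcgnomes' usual canonical output (worth verifying against the pinned 1.8.6 release):

import mhcgnomes

# Several raw spellings collapse to one canonical name, so downstream
# grouping by allele works across predictors.
for raw in ["A0201", "HLA-A*02:01", "hla-a*02:01"]:
    print(mhcgnomes.parse(raw).to_string())  # each prints: HLA-A*02:01
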
4 changes: 2 additions & 2 deletions modules/local/mhcflurry.nf
@@ -15,14 +15,14 @@ process MHCFLURRY {

output:
tuple val(meta), path("*.csv"), emit: predicted
path "versions.yml", emit: versions
path "versions.yml" , emit: versions

script:
if (meta.mhc_class == "II") {
error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!")
}
def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: meta.sample
+ def prefix = task.ext.prefix ?: "${meta.sample}"

"""
mhcflurry-downloads fetch models_class1_presentation
4 changes: 2 additions & 2 deletions modules/local/mhcnuggets/main.nf
@@ -11,7 +11,7 @@ process MHCNUGGETS {
tuple val(meta), path(tsv)

output:
- tuple val(meta), path("*_predicted_mhcnuggets.csv"), emit: predicted
+ tuple val(meta), path("*{_predicted_mhcnuggets.csv,_predicted_mhcnuggetsii.csv}"), emit: predicted
path "versions.yml" , emit: versions

script:
@@ -20,7 +20,7 @@ process MHCNUGGETS {

stub:
def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: meta.sample
+ def prefix = task.ext.prefix ?: "${meta.sample}"
"""
touch ${prefix}_predicted_mhcnuggets.tsv
9 changes: 5 additions & 4 deletions modules/local/mhcnuggets/templates/mhcnuggets.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
self.input = "$tsv"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
- self.alleles = "$meta.alleles".split(";")
+ self.alleles = "$meta.alleles_supported".split(";")
self.parse_ext_args("$task.ext.args")

def parse_ext_args(self, args_string: str) -> None:
@@ -87,7 +87,7 @@ def main():
# Predict and load written tsv file
predicted_df = []
for allele in args.alleles:
- mhcnuggets_allele = 'HLA-'+allele.replace('*','')
+ mhcnuggets_allele = allele.replace('*','')
predict(class_=args.mhc_class, peptides_path = args.input, mhc=mhcnuggets_allele,
output=f'{args.prefix}_{allele}.csv', rank_output=True)

@@ -97,7 +97,8 @@ def main():

# Append dataframe per allele and write file
predicted_df = pd.concat(predicted_df)
- predicted_df.to_csv(f'{args.prefix}_predicted_mhcnuggets.csv', index=False)
+ filename_out = f'{args.prefix}_predicted_mhcnuggets.csv' if args.mhc_class == 'I' else f'{args.prefix}_predicted_mhcnuggetsii.csv'
+ predicted_df.to_csv(filename_out, index=False)

# Parse versions
versions_this_module = {}
@@ -106,7 +107,7 @@ def main():
f.write(Version.format_yaml_like(versions_this_module))
# No __version__ dunder or similar available, need to hardcode version
f.write('mhcnuggets: 2.4.0')


if __name__ == "__main__":
main()
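
Two easy-to-miss consequences of this template change: alleles now arrive pre-normalized via meta.alleles_supported (hence the dropped hard-coded 'HLA-' prefix), and class II output gets its own _predicted_mhcnuggetsii.csv name so merge_predictions.py can distinguish the two mhcnuggets modes by file path. A small illustration; the allele spelling and prefix are assumed example values:

# Allele handed to mhcnuggets' predict(): normalized input already carries
# its HLA- prefix, so only the '*' is stripped.
allele = "HLA-A*02:01"          # assumed mhcgnomes-normalized input
print(allele.replace('*', ''))  # HLA-A02:01

# Output filename now encodes the predictor variant.
prefix, mhc_class = "sample1", "II"  # hypothetical values
filename_out = (f"{prefix}_predicted_mhcnuggets.csv" if mhc_class == "I"
                else f"{prefix}_predicted_mhcnuggetsii.csv")
print(filename_out)  # sample1_predicted_mhcnuggetsii.csv
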
13 changes: 7 additions & 6 deletions modules/local/netmhciipan.nf
@@ -17,12 +17,13 @@ process NETMHCIIPAN {
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
- // Adjust for netMHCIIpan allele format
- def alleles = meta.alleles.tokenize(';').collect {
-     it.contains('DRB') ?
-         it.replace('*', '_').replace(':', '') :
-         ('HLA-' + it.replace('*', '').replace(':', ''))
- }.join(',')
+ // Adjust for netMHCIIpan allele format (e.g. DRB1_0101, HLA-DPA10103-DPB10101)
+ def alleles = meta.alleles_supported.tokenize(';')
+     .collect {
+         it.contains('DRB') ?
+             it.replace('*', '_').replace(':', '').replace('HLA-', '') :
+             it.replace('*', '').replace(':', '').replace('/','-')
+     }.join(',')

"""
netmhciipan/netMHCIIpan \
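Because meta.alleles_supported is mhcgnomes-normalized, the closure above must strip the 'HLA-' prefix for DRB alleles and flatten heterodimers. A rough Python rendering of that Groovy logic — the example alleles are assumed normalized spellings, not values from the pipeline:

# Mirrors the .collect closure in the module above.
def to_netmhciipan(allele: str) -> str:
    if 'DRB' in allele:
        return allele.replace('*', '_').replace(':', '').replace('HLA-', '')
    return allele.replace('*', '').replace(':', '').replace('/', '-')

print(to_netmhciipan('HLA-DRB1*01:01'))             # DRB1_0101
print(to_netmhciipan('HLA-DPA1*01:03/DPB1*01:01'))  # HLA-DPA10103-DPB10101
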
3 changes: 1 addition & 2 deletions modules/local/netmhcpan.nf
@@ -17,8 +17,7 @@ process NETMHCPAN {
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
- // A*01:217 to HLA-A01:217 for meta.alleles: Add HLA- to the allele names and strip the *.
- def alleles = meta.alleles.tokenize(';').collect { 'HLA-' + it.replace('*', '') }.join(',')
+ def alleles = meta.alleles_supported.tokenize(';').collect { it.replace('*', '') }.join(',')

"""
netmhcpan/netMHCpan \
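Same idea for class I: the normalized allele already carries its 'HLA-' prefix, so only the '*' is removed. An illustrative input/output pair, assuming mhcgnomes-normalized spelling:

# "HLA-A*02:01" (normalized) -> "HLA-A02:01", the spelling netMHCpan accepts.
print("HLA-A*02:01".replace('*', ''))  # HLA-A02:01
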
12 changes: 6 additions & 6 deletions modules/local/prepare_prediction_input/main.nf
@@ -2,20 +2,20 @@ process PREPARE_PREDICTION_INPUT {
label 'process_single'
tag "${meta.sample}"

conda "bioconda::mhcgnomes=1.8.4"
conda "bioconda::mhcgnomes=1.8.6"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.4--pyh7cba7a3_0' :
- 'quay.io/biocontainers/mhcgnomes:1.8.4--pyh7cba7a3_0' }"
+ 'https://depot.galaxyproject.org/singularity/mhcgnomes:1.8.6--pyh7cba7a3_0' :
+ 'biocontainers/mhcgnomes:1.8.6--pyh7cba7a3_0' }"

input:
tuple val(meta), path(tsv)
+ path(supported_alleles_json)

output:
- tuple val(meta), path("*.{csv,tsv}"), emit: prepared
- path "versions.yml", emit: versions
+ tuple val(meta), path("*.json"), path("*.{csv,tsv}"), emit: prepared
+ path "versions.yml" , emit: versions

script:

template "prepare_prediction_input.py"

stub:

modules/local/prepare_prediction_input/templates/prepare_prediction_input.py
@@ -4,8 +4,10 @@
import shlex
from enum import Enum
import logging
+ import json

import pandas as pd
+ import mhcgnomes

logging.basicConfig(
level=logging.INFO,
@@ -37,14 +39,16 @@ class Arguments:

def __init__(self) -> None:
self.input = "$tsv"
+ self.supported_alleles_json = "$supported_alleles_json"
self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id"
self.mhc_class = "$meta.mhc_class"
- self.alleles = "$meta.alleles".split(";")
- self.tools = "$params.tools"
+ self.alleles = "$meta.alleles"
+ self.tools = "$params.tools".split(',')
self.min_peptide_length_classI = int("$params.min_peptide_length_classI")
self.max_peptide_length_classI = int("$params.max_peptide_length_classI")
self.min_peptide_length_classII = int("$params.min_peptide_length_classII")
self.max_peptide_length_classII = int("$params.max_peptide_length_classII")
+ self.peptide_col_name = "sequence"
self.parse_ext_args("$task.ext.args")

def parse_ext_args(self, args_string: str) -> None:
@@ -100,52 +104,121 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


+ class Utils:
+     def parse_alleles(allele_str: str) -> str:
+         """
+         Function that parses optional txt allele input and
+         normalizes alleles using mhcgnomes
+         Args:
+             alleles (str): File path (one allele per line) or string in required format ("Allele1;Allele2;..")
+         Returns:
+             alleles_normalized: mhcgnomes-normalized allele string in format "Allele1;Allele2;.."
+         """
+         if allele_str.endswith(('.txt','.csv','.tsv')):
+             with open(allele_str, 'r') as f:
+                 alleles = [line.strip() for line in f.readlines()]
+         else:
+             alleles = allele_str.split(";")
+         alleles_normalized = [mhcgnomes.parse(allele).to_string() for allele in alleles]
+
+         return alleles_normalized
+
+     def keep_supported_alleles(allele_ls: list, tool: str, supported_alleles_tool: dict) -> list:
+         """
+         Function that filters out unsupported alleles of predictor for a given MHC class
+         Args:
+             allele_ls (list): List of alleles in mhcgnomes-normalized format
+             tool (str): Name of predictor tool
+             supported_alleles (dict): Dictionary with supported alleles or predictor
+         Returns:
+             tool_allele_input: List of supported alleles for the given tool
+         """
+         tool_allele_input = [allele for allele in allele_ls if allele in supported_alleles_tool]
+         if len(tool_allele_input) == 0:
+             logging.warning(f"No supported alleles for {tool} found. Aborting..")
+         elif len(tool_allele_input) < len(allele_ls):
+             logging.warning(f"Provided alleles {allele_ls}, but only {tool_allele_input} are supported by {tool}. Continuing with supported alleles..")
+
+         return tool_allele_input


def main():
args = Arguments()

+     ### DEBUGGING INPUT
+     #args.input = '/mnt/volume/data/epitopeprediction/peptides.tsv'
+     #args.supported_alleles_json = '/mnt/volume/dev/mhcgnomes/supported_alleles.json'
+     #args.mhc_class = 'I'
+     #args.alleles='/mnt/volume/dev/epitopeprediction-2/modules/local/check_supported_alleles/templates/test_alleles.txt'
+     #args.tools = ['mhcflurry', 'mhcnuggets', 'mhcnuggetsii', 'netmhcpan','netmhciipan']
+     #args.prefix = "test"
+     #args.min_peptide_length_classI = 8
+     #args.max_peptide_length_classI = 12
+     #args.min_peptide_length_classII = 12
+     #args.max_peptide_length_classII = 25
+     #args.peptide_col_name = "sequence"
+     ###

+     # Parse alleles to uniform format
+     alleles_normalized = Utils.parse_alleles(args.alleles)
+     supported_alleles_dict = json.load(open(args.supported_alleles_json))
+     # Filter alleles based on supported alleles for each tool and write to json
+     tools_allele_input = {}
+     for tool in args.tools:
+         tool_allele_input = Utils.keep_supported_alleles(alleles_normalized, tool, supported_alleles_dict[tool])
+         tools_allele_input[tool] = ';'.join(tool_allele_input)
+     with open(f"{args.prefix}_allele_input.json", "w") as f:
+         json.dump(tools_allele_input, f)

# Parse input file to desired format of tools
df_input = pd.read_csv(args.input, sep="\t")
logging.info(f"Reading in file with {len(df_input)} peptides..")
logging.info(f"Read file with {len(df_input)} peptides.")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
- df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
+ df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
- df = df_input[df_input["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]
+ df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")

# Filter peptides based on tool length boundaries and adjust input format
if "mhcflurry" in args.tools and args.mhc_class == "I":
- df_mhcflurry = df[df["sequence"].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
- logging.info(f"Input for NetMHCpan detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
+ df_mhcflurry = df[df[args.peptide_col_name].str.len().between(MinLength.MHCFLURRY.value, MaxLength.MHCFLURRY.value)]
+ logging.info(f"Input for MHCflurry detected. Preparing {len(df_mhcflurry)} peptides for prediction..")
# Get every combination of sequence and allele and write them to csv with columns sequence and allele
- df_mhcflurry['allele'] = [args.alleles] * len(df_mhcflurry)
+ df_mhcflurry['allele'] = [tools_allele_input['mhcflurry'].split(';')] * len(df_mhcflurry)
df_mhcflurry = df_mhcflurry.explode('allele').reset_index(drop=True)
- df_mhcflurry.rename(columns={"sequence": "peptide"}, inplace=True)
+ df_mhcflurry.rename(columns={args.peptide_col_name: "peptide"}, inplace=True)
df_mhcflurry[['peptide','allele']].to_csv(f'{args.prefix}_mhcflurry_input.csv', index=False)

if "mhcnuggets" in args.tools:
if args.mhc_class == "I":
df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSI.value)]
else:
df_mhcnuggets = df[df["sequence"].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSII.value)]
if "mhcnuggets" in args.tools and args.mhc_class == "I":
df_mhcnuggets = df[df[args.peptide_col_name].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSI.value)]
logging.info(f"Input for MHCnuggets detected. Preparing {len(df_mhcnuggets)} peptides for prediction..")
df_mhcnuggets[['sequence']].to_csv(f'{args.prefix}_mhcnuggets_input.tsv', sep="\t", header=False, index=False)

if "netmhcpan" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
if "mhcnuggetsii" in args.tools and args.mhc_class == "II":
df_mhcnuggets_ii = df[df[args.peptide_col_name].str.len().between(MinLength.MHCNUGGETS.value, MaxLength.MHCNUGGETS_CLASSII.value)]
logging.info(f"Input for MHCnuggets II detected. Preparing {len(df_mhcnuggets_ii)} peptides for prediction..")
df_mhcnuggets_ii[['sequence']].to_csv(f'{args.prefix}_mhcnuggetsii_input.tsv', sep="\t", header=False, index=False)

if "netmhcpan-4.1" in args.tools and args.mhc_class == "I":
df_netmhcpan = df[df[args.peptide_col_name].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhcpan)} peptides for prediction..")
df_netmhcpan[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

elif "netmhciipan" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
elif "netmhciipan-4.3" in args.tools and args.mhc_class == "II":
df_netmhciipan = df[df[args.peptide_col_name].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
logging.info(f"Input for NetMHCIIpan detected. Preparing {len(df_netmhciipan)} peptides for prediction..")
df_netmhciipan[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

# Parse versions
versions_this_module = {}
- versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
+ versions_this_module["${task.process}"] = Version.get_versions([argparse, pd, mhcgnomes])
with open("versions.yml", "w") as f:
f.write(Version.format_yaml_like(versions_this_module))

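Putting the new pieces of this template together: Utils.parse_alleles normalizes whatever the user supplied, Utils.keep_supported_alleles intersects that with each tool's entry in supported_alleles.json, and the result lands in {prefix}_allele_input.json for the predictor modules. A condensed, self-contained sketch of that flow — the inline dict stands in for the real JSON and the allele values are illustrative:

import json
import mhcgnomes

def parse_alleles(allele_str):
    # Normalize semicolon-separated allele spellings via mhcgnomes.
    return [mhcgnomes.parse(a).to_string() for a in allele_str.split(";")]

# Stand-in for json.load(open(args.supported_alleles_json)).
supported = {"mhcflurry": ["HLA-A*02:01"], "netmhcpan-4.1": ["HLA-A*02:01", "HLA-B*07:02"]}

normalized = parse_alleles("A0201;B0702")  # ['HLA-A*02:01', 'HLA-B*07:02']
tools_allele_input = {tool: ";".join(a for a in normalized if a in alleles)
                      for tool, alleles in supported.items()}
print(tools_allele_input)
# {'mhcflurry': 'HLA-A*02:01', 'netmhcpan-4.1': 'HLA-A*02:01;HLA-B*07:02'}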