Merge pull request #268 from jonasscheid/handle-variants
Refactor variant prediction
jonasscheid authored Feb 3, 2025
2 parents 048e86d + 5781a91 commit 69080c4
Showing 56 changed files with 1,214 additions and 755 deletions.
23 changes: 1 addition & 22 deletions assets/schema_input.json
@@ -14,28 +14,7 @@
            "meta": ["sample"]
        },
        "alleles": {
-            "anyOf": [
-                {
-                    "type": "string",
-                    "pattern": "^\\S+\\.txt$"
-                },
-                {
-                    "type": "string",
-                    "pattern": "^([A-E]{1}[*][0-9]{2}[:][0-9]{2})(;[A-E]{1}[*][0-9]{2}[:][0-9]{2})*$"
-                },
-                {
-                    "type": "string",
-                    "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;(DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})*$"
-                },
-                {
-                    "type": "string",
-                    "pattern": "^((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2})(;((DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}[-](DR|DP|DQ)[AB][0-9][*][0-9]{2}[:][0-9]{2}))*$"
-                },
-                {
-                    "type": "string",
-                    "pattern": "^[H][-][2][-][A-Za-z]{2,3}$"
-                }
-            ],
+            "type": "string",
            "errorMessage": "Alleles must be provided as string or file with extension '.txt'. Please check the documentation for more information.",
            "meta": ["alleles"]
        },
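The removed anyOf block encoded the accepted allele notations directly in the samplesheet schema: an allele .txt file, class I alleles (e.g. A*01:01), class II alleles and alpha/beta pairs, and mouse H-2 alleles. The simplified schema now only requires a string, apparently deferring detailed format validation to downstream steps. A minimal sketch that reuses the removed patterns purely for illustration (the example values are made up) to show the kinds of entries that remain valid:

import re

# Patterns copied from the removed "anyOf" block in assets/schema_input.json
ALLELE_TXT = r"^\S+\.txt$"
CLASS_I = r"^([A-E]{1}[*][0-9]{2}[:][0-9]{2})(;[A-E]{1}[*][0-9]{2}[:][0-9]{2})*$"
MOUSE_H2 = r"^[H][-][2][-][A-Za-z]{2,3}$"

# Hypothetical samplesheet values and the pattern each one used to satisfy
examples = {
    "alleles.txt": ALLELE_TXT,
    "A*01:01;B*07:02": CLASS_I,
    "H-2-Kb": MOUSE_H2,
}
for value, pattern in examples.items():
    assert re.match(pattern, value), value  # all of these still describe sensible inputs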
415 changes: 166 additions & 249 deletions bin/epaa.py

Large diffs are not rendered by default.

12 changes: 5 additions & 7 deletions bin/split_peptides.py
@@ -5,10 +5,9 @@
import math
from pathlib import Path

-def split_peptides(input_file, output_base, min_size, max_chunks):
+def split_peptides(input_file, min_size, max_chunks):
    """Splits the peptide input file into smaller chunks in a single pass."""
    input_path = Path(input_file)
-    output_base = Path(output_base)

    with input_path.open("r") as infile:
        lines = infile.readlines()  # Read all lines into memory
@@ -24,11 +23,11 @@ def split_peptides(input_file, output_base, min_size, max_chunks):
    chunk_size = max(min_size, math.ceil(total_size / num_chunks))

    for chunk_idx in range(num_chunks):
-        chunk_file = output_base.with_name(f"{output_base.stem}_chunk_{chunk_idx}.tsv")
        start = chunk_idx * chunk_size
        end = start + chunk_size

-        with chunk_file.open("w") as outfile:
+        # Some upstream processes insert dots in filename (e.g. snpsift split -> variant.chr8.tsv), which can cause downstream issues
+        outfile_name = f"{input_path.stem.replace('.','_')}_chunk_{chunk_idx}.tsv"
+        with open(outfile_name, "w") as outfile:
            outfile.write(header)
            outfile.writelines(data_lines[start:end])

@@ -38,12 +37,11 @@ def split_peptides(input_file, output_base, min_size, max_chunks):
def main():
    parser = argparse.ArgumentParser(description="Split a peptide file into smaller chunks.")
    parser.add_argument("-i", "--input", required=True, help="Input file containing peptides.")
-    parser.add_argument("-o", "--output_base", required=True, help="Base filename for output files.")
    parser.add_argument("--min_size", type=int, required=True, help="Minimum peptides per file.")
    parser.add_argument("--max_chunks", type=int, required=True, help="Maximum number of chunks.")

    args = parser.parse_args()
-    split_peptides(args.input, args.output_base, args.min_size, args.max_chunks)
+    split_peptides(args.input, args.min_size, args.max_chunks)

if __name__ == "__main__":
    main()
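With the -o/--output_base option gone, chunk files are now named after the input file itself, with dots in the stem replaced by underscores so names produced by upstream tools (e.g. snpsift split emitting variant.chr8.tsv) do not break extension handling downstream. A small illustration of the new naming rule (the filenames here are made up):

from pathlib import Path

# Derive the chunk filename the same way the refactored script does:
# keep only the input stem and replace dots so the .tsv suffix stays unambiguous.
def chunk_name(input_file: str, chunk_idx: int) -> str:
    stem = Path(input_file).stem  # "variant.chr8.tsv" -> "variant.chr8"
    return f"{stem.replace('.', '_')}_chunk_{chunk_idx}.tsv"

print(chunk_name("variant.chr8.tsv", 0))  # variant_chr8_chunk_0.tsv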
31 changes: 19 additions & 12 deletions conf/modules.config
@@ -32,23 +32,30 @@ process {
    }

    withName: SNPSIFT_SPLIT {
-        publishDir = [
-            path: { "${params.outdir}/split_input/${meta.sample}" },
-            mode: params.publish_dir_mode
-        ]
+        publishDir = [ enabled: false ]
    }

-    withName: EPYTOPE_PEPTIDE_PREDICTION_VAR {
-        // Argument list needs to end with --somatic_mutation
+    withName: EPYTOPE_VARIANT_PREDICTION {
+        ext.prefix = {"${vcf.baseName}"}
        ext.args = [
-            genome_reference != 'grch37' & genome_reference != 'grch38' ? "--genome_reference '${genome_reference}'" : '',
-            genome_reference == 'grch37' ? "--genome_reference 'https://grch37.ensembl.org/'" : '',
-            genome_reference == 'grch38' ? "--genome_reference 'https://www.ensembl.org'" : '',
-            '--somatic_mutation'
+            genome_reference != 'grch37' & genome_reference != 'grch38' ? "--genome_reference '${genome_reference}'" : '',
+            genome_reference == 'grch37' ? "--genome_reference 'https://grch37.ensembl.org/'" : '',
+            genome_reference == 'grch38' ? "--genome_reference 'https://www.ensembl.org'" : '',
+            params.proteome_reference ? "--proteome_reference ${params.proteome_reference}" : "",
+            params.fasta_output ? "--fasta_output" : "",
        ].join(' ').trim()
        publishDir = [
-            path: { "${params.outdir}/split_predictions/${meta.sample}" },
-            mode: params.publish_dir_mode
+            path: { "${params.outdir}/epytope" },
+            mode: params.publish_dir_mode,
+            pattern: '*.tsv'
        ]
    }
+    withName: CAT_FASTA {
+        ext.prefix = {"${meta.sample}"}
+        publishDir = [
+            path: { "${params.outdir}/epytope" },
+            mode: params.publish_dir_mode,
+            pattern: '*.fasta'
+        ]
+    }

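The new EPYTOPE_VARIANT_PREDICTION process assembles its command line from the ternaries above: a custom genome_reference value is passed through unless the grch37/grch38 shortcuts map to an Ensembl URL, and the optional proteome_reference and fasta_output parameters are appended only when set; the hard-coded --somatic_mutation flag is gone. A rough Python mirror of that argument assembly, for illustration only (the real resolution happens in Nextflow at task time, and the parameter values below are hypothetical):

def epytope_variant_args(genome_reference, proteome_reference=None, fasta_output=False):
    parts = [
        f"--genome_reference '{genome_reference}'" if genome_reference not in ("grch37", "grch38") else "",
        "--genome_reference 'https://grch37.ensembl.org/'" if genome_reference == "grch37" else "",
        "--genome_reference 'https://www.ensembl.org'" if genome_reference == "grch38" else "",
        f"--proteome_reference {proteome_reference}" if proteome_reference else "",
        "--fasta_output" if fasta_output else "",
    ]
    # Mirrors Groovy's .join(' ').trim(): empty entries leave inner blanks, only the ends are stripped
    return " ".join(parts).strip()

print(epytope_variant_args("grch38", fasta_output=True))
# --genome_reference 'https://www.ensembl.org'  --fasta_output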
5 changes: 4 additions & 1 deletion conf/test.config
@@ -25,5 +25,8 @@

// Input data
input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'

tools = 'mhcflurry'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
}
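Restricting the class I lengths to 9-10 cuts the number of candidate peptides the test has to score, which is what the added comment refers to. A back-of-the-envelope illustration with a hypothetical protein length (the numbers are illustrative, not taken from the test data):

# A protein of length L yields L - k + 1 candidate peptides per length k,
# so fewer allowed lengths means fewer predictions per sample.
def n_candidates(protein_len, min_len, max_len):
    return sum(protein_len - k + 1 for k in range(min_len, max_len + 1))

L = 300  # hypothetical protein length
print(n_candidates(L, 8, 12))  # wider span: 1455 candidates
print(n_candidates(L, 9, 10))  # restricted test span: 583 candidates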
7 changes: 6 additions & 1 deletion conf/test_full.config
@@ -16,6 +16,11 @@

// Input data for full size test
input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_full_test.csv'
-tools = 'syfpeithi,mhcflurry,mhcnuggets,mhcnuggetsii'
+tools = 'mhcflurry,mhcnuggets,mhcnuggetsii'
//TODO: Add netmhcpan and netmhciipan
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
min_peptide_length_classII = 14
max_peptide_length_classII = 15
}
5 changes: 5 additions & 0 deletions conf/test_grch38.config
@@ -21,4 +21,9 @@

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
genome_reference = 'grch38'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
min_peptide_length_classII = 14
max_peptide_length_classII = 15
}
3 changes: 3 additions & 0 deletions conf/test_mhcflurry.config
@@ -21,4 +21,7 @@

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_variants.csv'
tools = 'mhcflurry'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
}
5 changes: 5 additions & 0 deletions conf/test_mhcnuggets.config
@@ -21,4 +21,9 @@

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_variants_class1_and_2.csv'
tools = 'mhcnuggets,mhcnuggetsii'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
min_peptide_length_classII = 14
max_peptide_length_classII = 15
}
3 changes: 3 additions & 0 deletions conf/test_netmhciipan.config
@@ -22,4 +22,7 @@
input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_class2.csv'
tools = 'netmhciipan'
netmhciipan_path = './non-free/netmhciipan.tar.gz'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classII = 14
max_peptide_length_classII = 15
}
3 changes: 3 additions & 0 deletions conf/test_netmhcpan.config
@@ -22,4 +22,7 @@
input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv'
tools = 'netmhcpan'
netmhcpan_path = './non-free/netmhcpan.tar.gz'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
}
3 changes: 3 additions & 0 deletions conf/test_peptides.config
@@ -21,4 +21,7 @@

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_peptides.csv'
tools = 'mhcflurry,mhcnuggets'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
}
6 changes: 5 additions & 1 deletion conf/test_peptides_h2.config
@@ -20,5 +20,9 @@
config_profile_description = 'Peptide list based test profile for mouse alleles by MHCnuggets'

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_peptides_mouse.csv'
-tools = 'mhcnuggets-class-1'
+// TODO: test with multiple tools
+tools = 'mhcflurry,mhcnuggets'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
}
6 changes: 6 additions & 0 deletions conf/test_proteins.config
@@ -20,4 +20,10 @@
config_profile_description = 'Protein input test profile'

input = params.pipelines_testdata_base_path + 'epitopeprediction/testdata/sample_sheets/sample_sheet_proteins.csv'
tools = 'mhcflurry'
// Reduce number of possible peptide lengths to speed up test
min_peptide_length_classI = 9
max_peptide_length_classI = 10
min_peptide_length_classII = 14
max_peptide_length_classII = 15
}
12 changes: 11 additions & 1 deletion modules.json
@@ -5,20 +5,30 @@
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
"cat/cat": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
"csvtk/concat": {
"branch": "master",
"git_sha": "aa5c23023134cf2d8b75a95d53557890e40261b9",
"installed_by": ["modules"]
},
"gunzip": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"git_sha": "ce35ce92566b3328b405253543b9b2b4d4e5f4f7",
"installed_by": ["modules"]
},
"multiqc": {
"branch": "master",
"git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
"installed_by": ["modules"]
},
"snpsift/split": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
}
}
},
48 changes: 0 additions & 48 deletions modules/local/cat_files.nf

This file was deleted.

