Skip to content

Commit

Permalink
Merge pull request #204 from luisas/multiple_sequences_as_input
Browse files Browse the repository at this point in the history
Multiple sequences as input and restructuring of output
  • Loading branch information
luisas authored Nov 26, 2024
2 parents 933b858 + 7dc6ce0 commit f917cc7
Show file tree
Hide file tree
Showing 11 changed files with 152 additions and 25 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ jobs:
- "test_colabfold_webserver"
- "test_colabfold_download"
- "test_esmfold"
- "test_split_fasta"
isMaster:
- ${{ github.base_ref == 'master' }}
# Exclude conda and singularity on dev
Expand Down
2 changes: 1 addition & 1 deletion assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sequence,fasta
id,fasta
T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta
T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta
32 changes: 25 additions & 7 deletions conf/modules_alphafold2.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,18 @@ if (params.alphafold2_mode == 'standard') {
params.max_template_date ? "--max_template_date ${params.max_template_date}" : ''
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
[
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
],
[
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" },
mode: 'copy',
saveAs: { "${meta.id}.pdb" },
pattern: '*_alphafold2.pdb'
]
]
}
}
Expand All @@ -54,7 +63,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') {
withName: 'RUN_ALPHAFOLD2_MSA' {
ext.args = params.max_template_date ? "--max_template_date ${params.max_template_date}" : ''
publishDir = [
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -64,9 +73,18 @@ if (params.alphafold2_mode == 'split_msa_prediction') {
if(params.use_gpu) { accelerator = 1 }
ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false'
publishDir = [
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
[
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
],
[
path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" },
mode: 'copy',
saveAs: { "${meta.id}.pdb" },
pattern: '*_alphafold2.pdb'
]
]
}
}
Expand Down
32 changes: 24 additions & 8 deletions conf/modules_colabfold.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,18 @@ if (params.colabfold_server == 'webserver') {
params.host_url ? "--host-url ${params.host_url}" : ''
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
[
path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
],
[
path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" },
mode: 'copy',
saveAs: { "${meta.id}.pdb" },
pattern: '*_relaxed_rank_001*.pdb'
]
]
}
}
Expand Down Expand Up @@ -67,10 +75,18 @@ if (params.colabfold_server == 'local') {
params.use_templates ? '--templates' : ''
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
[
path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
],
[
path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" },
mode: 'copy',
saveAs: { "${meta.id}.pdb" },
pattern: '*_relaxed_rank_001*.pdb'
],
]
}
}
Expand Down
10 changes: 9 additions & 1 deletion conf/modules_esmfold.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,19 @@ process {
withName: 'RUN_ESMFOLD' {
ext.args = {params.use_gpu ? '' : '--cpu-only'}
publishDir = [
path: { "${params.outdir}/esmfold" },
[
path: { "${params.outdir}/esmfold/default" },
mode: 'copy',
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: '*.*'
],
[
path: { "${params.outdir}/esmfold/default/top_ranked_structures" },
mode: 'copy',
saveAs: { "${meta.id}.pdb" },
pattern: '*.pdb'
]
]
}

withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' {
Expand Down
38 changes: 38 additions & 0 deletions conf/test_split_fasta.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/proteinfold -profile test_colabfold_local,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

stubRun = true

// Limit resources so that this can run on GitHub Actions
process {
resourceLimits = [
cpus: 4,
memory: '15.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Input data to test colabfold with the colabfold webserver analysis
mode = 'colabfold'
colabfold_server = 'local'
split_fasta = true
colabfold_db = "${projectDir}/assets/dummy_db_dir"
input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv'
}

process {
withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' {
container = 'biocontainers/gawk:5.1.0'
}
}
15 changes: 7 additions & 8 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ The directories listed below will be created in the output directory after the p
<details markdown="1">
<summary>Output files</summary>

- `AlphaFold2/`
- `<SEQUENCE NAME>/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings
- `<SEQUENCE NAME>.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first)
- `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models
- `alphafold2/standard/` or `alphafold2/split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
- `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
- `DBs/` that contains symbolic links to the downloaded database and parameter files

</details>
Expand Down Expand Up @@ -91,7 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
<details markdown="1">
<summary>Output files</summary>

- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings
- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
- `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
- `DBs/` that contains symbolic links to the downloaded database and parameter files

</details>
Expand All @@ -115,9 +114,9 @@ Below you can find some indicative examples of the output images produced by Col
<details markdown="1">
<summary>Output files</summary>

- `esmfold/`
- `<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score (ranked first)
- `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models
- `esmfold/default`
contains the predicted structures. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the predicted models.
- `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
- `DBs/` that contains symbolic links to the downloaded database and parameter files

</details>
Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ The samplesheet can have as many columns as you desire, however, there is a stri

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet.

## Running the pipeline

The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below.
Expand Down
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ params {
input = null
mode = 'alphafold2' // {alphafold2, colabfold, esmfold}
use_gpu = false
split_fasta = false

// Alphafold2 parameters
alphafold2_mode = "standard"
Expand Down Expand Up @@ -240,6 +241,7 @@ profiles {
test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' }
test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' }
test_esmfold { includeConfig 'conf/test_esmfold.config' }
test_split_fasta { includeConfig 'conf/test_split_fasta.config' }
test_full { includeConfig 'conf/test_full.config' }
test_full_alphafold2_standard { includeConfig 'conf/test_full.config' }
test_full_alphafold2_split { includeConfig 'conf/test_full_alphafold_split.config' }
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
"description": "Run on CPUs (default) or GPUs",
"fa_icon": "fas fa-microchip"
},
"split_fasta": {
"type": "boolean",
"description": "Split input multi-fasta file in separated fasta files each of them containing one sequence to be folded",
"fa_icon": "fas fa-microchip"
},
"email": {
"type": "string",
"description": "Email address for completion summary.",
Expand Down
38 changes: 38 additions & 0 deletions subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ workflow PIPELINE_INITIALISATION {
// Create channel from input file provided through params.input
//
ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json"))
if (params.split_fasta) {
// TODO: here we have to validate that the ids are unique and valid as an extra step
// since it is not done with the samplesheet schema (they are all in the same file)
ch_samplesheet.map { meta, fasta ->
validateFasta(fasta)
}

// Split the fasta file into individual files for each sequence
ch_samplesheet
.map { meta,fasta -> fasta }
.splitFasta( record: [header: true, sequence: true] )
.collectFile { item ->
[ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ]
}
.map {
file -> [[id: file.baseName], file]
}
.set { ch_samplesheet }
}

emit:
samplesheet = ch_samplesheet
Expand Down Expand Up @@ -214,3 +233,22 @@ def methodsDescriptionText(mqc_methods_yaml) {
return description_html.toString()
}

def cleanHeader(header) {
return header.replaceAll(" ", "_").replaceAll(",", "").replaceAll(";","")
}

def validateFasta(fasta) {
// extract headers
def headers = fasta.findAll { it.startsWith('>') }
// if headers are not unique, throw an error
if (headers.size() != headers.unique().size()) {
throw new Exception("Invalid FASTA file. The headers are not unique.")
}
// check headers that are malformed
headers.each { header ->
if (header =~ /[ \t;,]/) {
// warn user that the header contains special characters
log.warn "The header ${header} contains special characters. They have been automatically removed."
}
}
}

0 comments on commit f917cc7

Please sign in to comment.