Merge pull request #204 from luisas/multiple_sequences_as_input

Multiple sequences as input and restructuring of output
nf-core · Nov 26, 2024 · f917cc7 · f917cc7
2 parents 933b858 + 7dc6ce0
commit f917cc7
Show file tree

Hide file tree

Showing 11 changed files with 152 additions and 25 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -43,6 +43,7 @@ jobs:
           - "test_colabfold_webserver"
           - "test_colabfold_download"
           - "test_esmfold"
+          - "test_split_fasta"
         isMaster:
           - ${{ github.base_ref == 'master' }}
         # Exclude conda and singularity on dev

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sequence,fasta
+id,fasta
 T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta
 T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta
diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config
@@ -40,9 +40,18 @@ if (params.alphafold2_mode == 'standard') {
                 params.max_template_date ? "--max_template_date ${params.max_template_date}" : ''
             ].join(' ').trim()
             publishDir = [
-                path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
-                mode: 'copy',
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                [
+                    path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
+                    mode: 'copy',
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                    pattern: '*.*'
+                ],
+                [
+                    path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" },
+                    mode: 'copy',
+                    saveAs: { "${meta.id}.pdb" },
+                    pattern: '*_alphafold2.pdb'
+                ]
             ]
         }
     }
@@ -54,7 +63,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') {
         withName: 'RUN_ALPHAFOLD2_MSA' {
             ext.args =  params.max_template_date ? "--max_template_date ${params.max_template_date}" : ''
             publishDir = [
-                path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
+                path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" },
                 mode: 'copy',
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
@@ -64,9 +73,18 @@ if (params.alphafold2_mode == 'split_msa_prediction') {
             if(params.use_gpu) { accelerator = 1 }
             ext.args   = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false'
             publishDir = [
-                path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
-                mode: 'copy',
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+                [
+                    path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" },
+                    mode: 'copy',
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                    pattern: '*.*'
+                ],
+                [
+                    path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" },
+                    mode: 'copy',
+                    saveAs: { "${meta.id}.pdb" },
+                    pattern: '*_alphafold2.pdb'
+                ]
             ]
         }
     }

diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config
@@ -30,10 +30,18 @@ if (params.colabfold_server == 'webserver') {
                 params.host_url ? "--host-url ${params.host_url}" : ''
             ].join(' ').trim()
             publishDir = [
-                path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
-                mode: 'copy',
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-                pattern: '*.*'
+                [
+                    path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
+                    mode: 'copy',
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                    pattern: '*.*'
+                ],
+                [
+                    path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" },
+                    mode: 'copy',
+                    saveAs: { "${meta.id}.pdb" },
+                    pattern: '*_relaxed_rank_001*.pdb'
+                ]
             ]
         }
     }
@@ -67,10 +75,18 @@ if (params.colabfold_server == 'local') {
                 params.use_templates ? '--templates' : ''
             ].join(' ').trim()
             publishDir = [
-                path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
-                mode: 'copy',
-                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-                pattern: '*.*'
+                [
+                    path: { "${params.outdir}/colabfold/${params.colabfold_server}" },
+                    mode: 'copy',
+                    saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                    pattern: '*.*'
+                ],
+                [
+                    path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" },
+                    mode: 'copy',
+                    saveAs: { "${meta.id}.pdb" },
+                    pattern: '*_relaxed_rank_001*.pdb'
+                ],
             ]
         }
     }

diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config
@@ -14,11 +14,19 @@ process {
     withName: 'RUN_ESMFOLD' {
         ext.args = {params.use_gpu ? '' : '--cpu-only'}
         publishDir = [
-                path: { "${params.outdir}/esmfold" },
+            [
+                path: { "${params.outdir}/esmfold/default" },
                 mode: 'copy',
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
                 pattern: '*.*'
+            ],
+            [
+                path: { "${params.outdir}/esmfold/default/top_ranked_structures" },
+                mode: 'copy',
+                saveAs: { "${meta.id}.pdb" },
+                pattern: '*.pdb'
             ]
+        ]
     }
 
     withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' {

diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config
@@ -0,0 +1,38 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+    Use as follows:
+        nextflow run nf-core/proteinfold -profile test_split_fasta,<docker/singularity> --outdir <OUTDIR>
+----------------------------------------------------------------------------------------
+*/
+
+stubRun = true
+
+// Limit resources so that this can run on GitHub Actions
+process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
+}
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Input data to test splitting of multi-sequence FASTA files with the local colabfold mode
+    mode             = 'colabfold'
+    colabfold_server = 'local'
+    split_fasta      = true
+    colabfold_db     = "${projectDir}/assets/dummy_db_dir"
+    input            = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv'
+}
+
+process {
+    withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' {
+        container = 'biocontainers/gawk:5.1.0'
+    }
+}
diff --git a/docs/output.md b/docs/output.md
@@ -23,10 +23,8 @@ The directories listed below will be created in the output directory after the p
 <details markdown="1">
 <summary>Output files</summary>
 
-- `AlphaFold2/`
-  - `<SEQUENCE NAME>/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings
-  - `<SEQUENCE NAME>.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first)
-  - `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models
+- `alphafold2/standard/` or `alphafold2/split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
+  - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
 - `DBs/` that contains symbolic links to the downloaded database and parameter files
 
 </details>
@@ -91,7 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
 <details markdown="1">
 <summary>Output files</summary>
 
-- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings
+- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models.
+  - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
 - `DBs/` that contains symbolic links to the downloaded database and parameter files
 
 </details>
@@ -115,9 +114,9 @@ Below you can find some indicative examples of the output images produced by Col
 <details markdown="1">
 <summary>Output files</summary>
 
-- `esmfold/`
-  - `<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score (ranked first)
-  - `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models
+- `esmfold/default`
+  contains the predicted structures. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the predicted models.
+  - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first)
 - `DBs/` that contains symbolic links to the downloaded database and parameter files
 
 </details>

diff --git a/docs/usage.md b/docs/usage.md
@@ -35,6 +35,8 @@ The samplesheet can have as many columns as you desire, however, there is a stri
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
+Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet.
+
 ## Running the pipeline
 
 The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below.

diff --git a/nextflow.config b/nextflow.config
@@ -13,6 +13,7 @@ params {
     input                       = null
     mode                        = 'alphafold2' // {alphafold2, colabfold, esmfold}
     use_gpu                     = false
+    split_fasta                 = false
 
     // Alphafold2 parameters
     alphafold2_mode             = "standard"
@@ -240,6 +241,7 @@ profiles {
     test_colabfold_webserver      { includeConfig 'conf/test_colabfold_webserver.config'               }
     test_colabfold_download       { includeConfig 'conf/test_colabfold_download.config'                }
     test_esmfold                  { includeConfig 'conf/test_esmfold.config'                           }
+    test_split_fasta              { includeConfig 'conf/test_split_fasta.config'                       }
     test_full                     { includeConfig 'conf/test_full.config'                              }
     test_full_alphafold2_standard { includeConfig 'conf/test_full.config'                              }
     test_full_alphafold2_split    { includeConfig 'conf/test_full_alphafold_split.config'              }

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -40,6 +40,11 @@
                     "description": "Run on CPUs (default) or GPUs",
                     "fa_icon": "fas fa-microchip"
                 },
+                "split_fasta": {
+                    "type": "boolean",
+                    "description": "Split each input multi-FASTA file into separate FASTA files, each containing a single sequence to be folded",
+                    "fa_icon": "fas fa-microchip"
+                },
                 "email": {
                     "type": "string",
                     "description": "Email address for completion summary.",

diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf
@@ -66,6 +66,25 @@ workflow PIPELINE_INITIALISATION {
     // Create channel from input file provided through params.input
     //
     ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json"))
+    if (params.split_fasta) {
+        // TODO: here we have to validate that the ids are unique and valid as an extra step
+        // since it is not done with the samplesheet schema (they are all in the same file)
+        ch_samplesheet.map { meta, fasta ->
+            validateFasta(fasta)
+        }
+
+        // Split the fasta file into individual files for each sequence
+        ch_samplesheet
+            .map { meta,fasta -> fasta }
+            .splitFasta( record: [header: true, sequence: true] )
+            .collectFile { item ->
+                [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ]
+            }
+            .map {
+                file -> [[id: file.baseName], file]
+            }
+            .set { ch_samplesheet }
+    }
 
     emit:
     samplesheet = ch_samplesheet
@@ -214,3 +233,22 @@ def methodsDescriptionText(mqc_methods_yaml) {
     return description_html.toString()
 }
 
+/**
+ * Sanitise a FASTA header so it can be used as a sequence id and as a file name.
+ *
+ * Spaces and tabs are replaced by underscores; commas and semicolons are dropped.
+ * Tabs are handled here because validateFasta() reports them (together with
+ * spaces, commas and semicolons) as characters that get cleaned automatically.
+ *
+ * @param header FASTA header text
+ * @return the sanitised header
+ */
+def cleanHeader(header) {
+    return header.replaceAll(/[ \t]/, "_").replaceAll(/[,;]/, "")
+}
+
+/**
+ * Validate the headers of a FASTA file before it is split into single sequences.
+ *
+ * Throws if two header lines in the file are identical, and logs a warning for
+ * every header containing a space, tab, semicolon or comma.
+ *
+ * @param fasta the FASTA file to check (presumably a java.nio.file.Path coming from
+ *              the samplesheet; a java.io.File works as well) - confirm against caller
+ * @throws Exception when the header lines of the file are not unique
+ */
+def validateFasta(fasta) {
+    // Read the lines explicitly: calling findAll directly on a Path walks its
+    // path segments rather than the file content, so no header would be found
+    def headers = fasta.readLines().findAll { it.startsWith('>') }
+    // Compare against a set instead of unique(), which de-duplicates the list in place
+    if (headers.size() != headers.toSet().size()) {
+        throw new Exception("Invalid FASTA file. The headers are not unique.")
+    }
+    // Flag headers containing characters that are cleaned when the file is split
+    headers.each { header ->
+        if (header =~ /[ \t;,]/) {
+            // warn user that the header contains special characters
+            log.warn "The header ${header} contains special characters. They have been automatically removed."
+        }
+    }
+}