diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 161ca5e8..196a9393 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,7 @@ jobs: - "test_colabfold_webserver" - "test_colabfold_download" - "test_esmfold" + - "test_split_fasta" isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 467fdcf0..b458d604 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sequence,fasta +id,fasta T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 33b04c38..a12105ab 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -40,9 +40,18 @@ if (params.alphafold2_mode == 'standard') { params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_alphafold2.pdb' + ] ] } } @@ -54,7 +63,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { withName: 'RUN_ALPHAFOLD2_MSA' { ext.args = params.max_template_date ? 
"--max_template_date ${params.max_template_date}" : '' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -64,9 +73,18 @@ if (params.alphafold2_mode == 'split_msa_prediction') { if(params.use_gpu) { accelerator = 1 } ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_alphafold2.pdb' + ] ] } } diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 2efcfa01..c37214d3 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -30,10 +30,18 @@ if (params.colabfold_server == 'webserver') { params.host_url ? "--host-url ${params.host_url}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_001*.pdb' + ] ] } } @@ -67,10 +75,18 @@ if (params.colabfold_server == 'local') { params.use_templates ? '--templates' : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_001*.pdb' + ], ] } } diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index d8356924..3468718f 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -14,11 +14,19 @@ process { withName: 'RUN_ESMFOLD' { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ - path: { "${params.outdir}/esmfold" }, + [ + path: { "${params.outdir}/esmfold/default" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, pattern: '*.*' + ], + [ + path: { "${params.outdir}/esmfold/default/top_ranked_structures" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*.pdb' ] + ] } withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' { diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config new file mode 100644 index 00000000..a1c3c683 --- /dev/null +++ b/conf/test_split_fasta.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + Use as follows: + nextflow run nf-core/proteinfold -profile test_split_fasta,<docker/singularity> --outdir <OUTDIR> +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test multi-FASTA splitting with the local colabfold search + mode = 'colabfold' + colabfold_server = 'local' + split_fasta = true + colabfold_db = "${projectDir}/assets/dummy_db_dir" + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' +} + +process { + withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/docs/output.md b/docs/output.md index 9b9a8fb8..c2350157 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,10 +23,8 @@ The directories listed below will be created in the output directory after the p <details markdown="1"> <summary>Output files</summary> -- `AlphaFold2/` - - `<SEQUENCE NAME>/` that 
contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings - - `<SEQUENCE NAME>.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first) - - `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models +- `alphafold2/standard/` or `alphafold2/split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files </details> @@ -91,7 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p <details markdown="1"> <summary>Output files</summary> -- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings +- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. 
+ - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files </details> @@ -115,9 +114,9 @@ Below you can find some indicative examples of the output images produced by Col <details markdown="1"> <summary>Output files</summary> -- `esmfold/` - - `<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score (ranked first) - - `<SEQUENCE NAME>_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models +- `esmfold/default` + contains the predicted structures. Specifically, `<SEQUENCE NAME>_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the predicted models. + - `top_ranked_structures/<SEQUENCE NAME>.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files </details> diff --git a/docs/usage.md b/docs/usage.md index cc7e0b15..3ac88ecd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,6 +35,8 @@ The samplesheet can have as many columns as you desire, however, there is a stri An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet. + ## Running the pipeline The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below. 
diff --git a/nextflow.config b/nextflow.config index d8fc2623..ed874e2c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null mode = 'alphafold2' // {alphafold2, colabfold, esmfold} use_gpu = false + split_fasta = false // Alphafold2 parameters alphafold2_mode = "standard" @@ -240,6 +241,7 @@ profiles { test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' } test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' } test_esmfold { includeConfig 'conf/test_esmfold.config' } + test_split_fasta { includeConfig 'conf/test_split_fasta.config' } test_full { includeConfig 'conf/test_full.config' } test_full_alphafold2_standard { includeConfig 'conf/test_full.config' } test_full_alphafold2_split { includeConfig 'conf/test_full_alphafold_split.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 313997a8..d8191d6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -40,6 +40,11 @@ "description": "Run on CPUs (default) or GPUs", "fa_icon": "fas fa-microchip" }, + "split_fasta": { + "type": "boolean", + "description": "Split input multi-fasta file in separated fasta files each of them containing one sequence to be folded", + "fa_icon": "fas fa-microchip" + }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index fa0545a6..c9bd0d57 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -66,6 +66,25 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) + if (params.split_fasta) { + // TODO: here we have to validate that the ids are unique and valid as an extra 
step
+        // since it is not done with the samplesheet schema (they are all in the same file)
+        ch_samplesheet.map { meta, fasta ->
+            validateFasta(fasta)
+        }
+
+        // Split the fasta file into individual files for each sequence
+        ch_samplesheet
+            .map { meta,fasta -> fasta }
+            .splitFasta( record: [header: true, sequence: true] )
+            .collectFile { item ->
+                [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ]
+            }
+            .map {
+                file -> [[id: file.baseName], file]
+            }
+            .set { ch_samplesheet }
+    }
 
     emit:
     samplesheet = ch_samplesheet
@@ -214,3 +233,35 @@ def methodsDescriptionText(mqc_methods_yaml) {
     return description_html.toString()
 }
 
+//
+// Turn a FASTA header into a string that is safe to use as sequence id and file name:
+// spaces and tabs become underscores, commas and semicolons are dropped
+//
+def cleanHeader(header) {
+    return header.replaceAll(/[ \t]/, "_").replaceAll(/[,;]/, "")
+}
+
+//
+// Check the headers ('>' lines) of a FASTA file before it is split per sequence.
+// Throws an Exception when two headers are identical; logs a warning for every header
+// containing characters that cleanHeader() rewrites (space, tab, ';' or ',').
+//
+def validateFasta(fasta) {
+    // NOTE(review): assumes the samplesheet hands over a Path/File -- confirm against schema_input.json.
+    // Read its lines explicitly: findAll() called directly on a Path walks the path's name
+    // elements instead of the file content, so no header would ever be found.
+    def lines = (fasta instanceof java.nio.file.Path || fasta instanceof File) ? fasta.readLines() : fasta
+    def headers = lines.findAll { it.startsWith('>') }
+    // if headers are not unique, throw an error
+    // (toSet() leaves `headers` untouched, whereas unique() would de-duplicate it in place)
+    if (headers.size() != headers.toSet().size()) {
+        throw new Exception("Invalid FASTA file. The headers are not unique.")
+    }
+    // check headers that are malformed
+    headers.each { header ->
+        if (header =~ /[ \t;,]/) {
+            // warn user that the header contains special characters
+            log.warn "The header ${header} contains special characters. They have been automatically removed."
+        }
+    }
+}