From e87572cea26295eb256f60232c540dd765844600 Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 23 Oct 2024 18:19:11 +0200 Subject: [PATCH 01/40] first commit --- .github/workflows/ci.yml | 1 + assets/samplesheet.csv | 6 +-- assets/schema_input.json | 11 +++++- conf/modules_colabfold.config | 16 ++++++-- conf/test_split_fasta.config | 39 +++++++++++++++++++ docs/usage.md | 2 + nextflow.config | 2 + nextflow_schema.json | 5 +++ .../utils_nfcore_proteinfold_pipeline/main.nf | 12 ++++++ 9 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 conf/test_split_fasta.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 161ca5e8..196a9393 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,7 @@ jobs: - "test_colabfold_webserver" - "test_colabfold_download" - "test_esmfold" + - "test_split_fasta" isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 467fdcf0..5e7df047 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sequence,fasta -T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta -T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta +sequence,fasta,reference,dependencies +seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/af2_structures/seatoxin-ref.tar.gz +toxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref, \ No newline at end of file diff --git a/assets/schema_input.json b/assets/schema_input.json index c261ae58..2bbdf919 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,6 +13,12 @@ "errorMessage": "Sequence name must be provided and cannot contain spaces", "meta": ["id"] }, + "id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sequence name must be provided and cannot contain spaces", + "meta": ["id"] + }, "fasta": { "type": "string", "format": "file-path", @@ -21,6 +27,9 @@ "errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'" } }, - "required": ["sequence", "fasta"] + "oneOf": [ + { "required": ["sequence", "fasta"] }, + { "required": ["id", "fasta"] } + ] } } diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 2efcfa01..00da59e7 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -67,10 +67,18 @@ if (params.colabfold_server == 'local') { params.use_templates ? '--templates' : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_01.pdb' + ], ] } } diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config new file mode 100644 index 00000000..c3feb113 --- /dev/null +++ b/conf/test_split_fasta.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + Use as follows: + nextflow run nf-core/proteinfold -profile test_colabfold_local, --outdir +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test colabfold with the colabfold webserver analysis + mode = 'colabfold' + colabfold_server = 'local' + split_fasta = true + colabfold_db = "${projectDir}/assets/dummy_db_dir" + //input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' + input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' +} + +process { + withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/docs/usage.md b/docs/usage.md index 43b0d86a..d4502203 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,6 +35,8 @@ The samplesheet can have as many columns as you desire, however, there is a stri An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +Each FASTA file is assumed to contain a single protein sequence unless you are using multimer mode. If you want to provide a FASTA file with multiple protein sequences, each to be folded individually, you can supply one or more FASTA files containing one or more sequences and use the --split_fasta parameter. In this case, each sequence in the FASTA file will be folded individually and in parallel, as if you had listed each sequence separately in the samplesheet. + ## Running the pipeline The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below. diff --git a/nextflow.config b/nextflow.config index d8fc2623..ed874e2c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null mode = 'alphafold2' // {alphafold2, colabfold, esmfold} use_gpu = false + split_fasta = false // Alphafold2 parameters alphafold2_mode = "standard" @@ -240,6 +241,7 @@ profiles { test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' } test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' } test_esmfold { includeConfig 'conf/test_esmfold.config' } + test_split_fasta { includeConfig 'conf/test_split_fasta.config' } test_full { includeConfig 'conf/test_full.config' } test_full_alphafold2_standard { includeConfig 'conf/test_full.config' } test_full_alphafold2_split { includeConfig 'conf/test_full_alphafold_split.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 313997a8..8df979ce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -40,6 +40,11 @@ "description": "Run on CPUs (default) or GPUs", "fa_icon": "fas fa-microchip" }, + "split_fasta": { + "type": "boolean", + "description": "Split input fasta file in multiple fasta files each of them containing one sequence to be folded", + "fa_icon": "fas fa-microchip" + }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index fa0545a6..9c3ebe1c 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -67,6 +67,18 @@ workflow PIPELINE_INITIALISATION { // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) + if (params.split_fasta) { + + ch_samplesheet.splitFasta(record: [id:true]) + .map{ record -> record.id.toString() } + .set{ ID }.view() + ch_samplesheet = ch_samplesheet.map{meta, fasta -> fasta} + .splitFasta( by:1, file: true ) + .map{fasta -> [[id:record.id], fasta ]}.view() + } + + ch_samplesheet.view() + emit: samplesheet = ch_samplesheet versions = ch_versions From d78bf35f140e51ad575f2319e56212ecab656eec Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 16:04:39 +0200 Subject: [PATCH 02/40] update --- assets/samplesheet.csv | 3 -- assets/schema_input.json | 11 +------ conf/modules.config | 8 +++++ conf/modules_alphafold2.config | 30 +++++++++++++++---- conf/modules_colabfold.config | 20 +++++++++---- conf/modules_esmfold.config | 11 +++++-- conf/test_split_fasta.config | 6 ++-- docs/output.md | 14 +++++---- main.nf | 6 +++- .../utils_nfcore_proteinfold_pipeline/main.nf | 27 +++++++++++------ 10 files changed, 90 insertions(+), 46 deletions(-) delete mode 100644 assets/samplesheet.csv diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5e7df047..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sequence,fasta,reference,dependencies -seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/af2_structures/seatoxin-ref.tar.gz -toxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref, \ No newline at end of file diff --git a/assets/schema_input.json b/assets/schema_input.json index 2bbdf919..c261ae58 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,12 +13,6 @@ "errorMessage": "Sequence name must be provided and cannot contain spaces", "meta": ["id"] }, - "id": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sequence name must be provided and cannot contain spaces", - "meta": ["id"] - }, "fasta": { "type": "string", "format": "file-path", @@ -27,9 +21,6 @@ "errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'" } }, - "oneOf": [ - { "required": ["sequence", "fasta"] }, - { "required": ["id", "fasta"] } - ] + "required": ["sequence", "fasta"] } } diff --git a/conf/modules.config b/conf/modules.config index c56b11eb..5f6fbd9f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,6 +50,14 @@ process { ] } + withName: 'GENERATE_REPORT'{ + publishDir = [ + path: { "${params.outdir}/report" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'FOLDSEEK_EASYSEARCH' { ext.args = { params.foldseek_easysearch_arg ? "$params.foldseek_easysearch_arg" : "--format-mode 3" } publishDir = [ diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 33b04c38..c8b4fab3 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -40,9 +40,18 @@ if (params.alphafold2_mode == 'standard') { params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*.1.alphafold.pdb' + ] ] } } @@ -64,9 +73,18 @@ if (params.alphafold2_mode == 'split_msa_prediction') { if(params.use_gpu) { accelerator = 1 } ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: 'ranked_0.pdb' + ] ] } } diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 00da59e7..ecf87d75 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -30,10 +30,18 @@ if (params.colabfold_server == 'webserver') { params.host_url ? "--host-url ${params.host_url}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_01.pdb' + ] ] } } @@ -68,13 +76,13 @@ if (params.colabfold_server == 'local') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index d8356924..ba523450 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -14,11 +14,18 @@ process { withName: 'RUN_ESMFOLD' { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ - path: { "${params.outdir}/esmfold" }, + [ + path: { "${params.outdir}/esmfold/complete_results" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/esmfold" }, + mode: 'copy', + pattern: '*.pdb' ] + ] } withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' { diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index c3feb113..44130987 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -24,12 +24,12 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data to test colabfold with the colabfold webserver analysis - mode = 'colabfold' + mode = 'colabold' colabfold_server = 'local' split_fasta = true colabfold_db = "${projectDir}/assets/dummy_db_dir" - //input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' + //input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' } process { diff --git a/docs/output.md b/docs/output.md index 9b9a8fb8..291614a5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,10 +23,9 @@ The directories listed below will be created in the output directory after the p
Output files -- `AlphaFold2/` - - `/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings - - `.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models +- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. + - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `.pdb` that is the structure with the highest pLDDT score (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -91,7 +90,9 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings +- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. + - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `.pdb` that is the structure with the highest pLDDT score (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -117,7 +118,8 @@ Below you can find some indicative examples of the output images produced by Col - `esmfold/` - `.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models + - `complete_results` + - `_plddt_mqc.tsv` that presents the pLDDT scores per residue. - `DBs/` that contains symbolic links to the downloaded database and parameter files diff --git a/main.nf b/main.nf index eaf0eac1..34c1f7e0 100644 --- a/main.nf +++ b/main.nf @@ -64,6 +64,7 @@ workflow NFCORE_PROTEINFOLD { ch_multiqc = Channel.empty() ch_versions = Channel.empty() ch_report_input = Channel.empty() + ch_outputsheet = Channel.empty() // // WORKFLOW: Run alphafold2 @@ -146,7 +147,6 @@ workflow NFCORE_PROTEINFOLD { params.create_colabfold_index ) ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS.out.versions) - // // WORKFLOW: Run nf-core/colabfold workflow // @@ -159,6 +159,7 @@ workflow NFCORE_PROTEINFOLD { PREPARE_COLABFOLD_DBS.out.uniref30, params.num_recycles_colabfold ) + ch_multiqc = COLABFOLD.out.multiqc_report ch_versions = ch_versions.mix(COLABFOLD.out.versions) ch_report_input = ch_report_input.mix( @@ -168,6 +169,8 @@ workflow NFCORE_PROTEINFOLD { .join(COLABFOLD.out.msa) .map { it[0]["model"] = "colabfold"; it } ) + // ch_outputsheet = ch_report_input.transpose(by:1).filter{it[1].name.contains("rank_01")} + // ch_outputsheet.view() } // @@ -231,6 +234,7 @@ workflow NFCORE_PROTEINFOLD { ) } + emit: multiqc_report = ch_multiqc } diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index 9c3ebe1c..c3642f50 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -68,17 +68,26 @@ workflow PIPELINE_INITIALISATION { ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) if (params.split_fasta) { - - ch_samplesheet.splitFasta(record: [id:true]) - .map{ record -> record.id.toString() } - .set{ ID }.view() - ch_samplesheet = ch_samplesheet.map{meta, fasta -> fasta} - .splitFasta( by:1, file: true ) - .map{fasta -> [[id:record.id], fasta ]}.view() + // Extract all sequence headers from the fasta file + // to keep track of which sequences belong to which dataset + // and create a new channel [[id:{dataset_id}, sequence:{sequence_id}]] + ch_samplesheet.splitFasta(by:1, record: [header:true]) + .map{meta, record -> [record.header, meta]} + .set{dataset_sequence_mapping} + + // Split the fasta file into individual files for each sequence + ch_samplesheet.map{ meta,fasta -> fasta} + .splitFasta( record: [id: true, sequence: true] ) + .collectFile { item -> + [ "${item["id"]}.fa", ">" + item["id"] + '\n' +item["sequence"] ] + }.map{ + file -> [file.baseName, file] + }.combine(dataset_sequence_mapping, by:0) + .map{ + id, file, meta -> [[id:id, dataset:meta.id], file] + }.set{ch_samplesheet} } - ch_samplesheet.view() - emit: samplesheet = ch_samplesheet versions = ch_versions From 66968b9c6d02e86672c39151fd9ba402593768de Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:10:39 +0200 Subject: [PATCH 03/40] update --- conf/test_split_fasta.config | 5 +-- .../utils_nfcore_proteinfold_pipeline/main.nf | 40 +++++++++++++------ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index 44130987..9eca8853 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -24,16 +24,15 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data to test colabfold with the colabfold webserver analysis - mode = 'colabold' + mode = 'colabfold' colabfold_server = 'local' split_fasta = true colabfold_db = "${projectDir}/assets/dummy_db_dir" input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - //input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' } process { withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { container = 'biocontainers/gawk:5.1.0' } -} +} \ No newline at end of file diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index c3642f50..faf0b3ff 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -66,25 +66,20 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) - if (params.split_fasta) { - // Extract all sequence headers from the fasta file - // to keep track of which sequences belong to which dataset - // and create a new channel [[id:{dataset_id}, sequence:{sequence_id}]] - ch_samplesheet.splitFasta(by:1, record: [header:true]) - .map{meta, record -> [record.header, meta]} - .set{dataset_sequence_mapping} + // here we have to validate that the ids are unique and valid as an extra step + // since it is not done with the samplesheet schema (they are all in the same file) + ch_samplesheet.map { meta, fasta -> + validateFasta(fasta) + } // Split the fasta file into individual files for each sequence ch_samplesheet.map{ meta,fasta -> fasta} - .splitFasta( record: [id: true, sequence: true] ) + .splitFasta( record: [header: true, sequence: true] ) .collectFile { item -> - [ "${item["id"]}.fa", ">" + item["id"] + '\n' +item["sequence"] ] + [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ] }.map{ - file -> [file.baseName, file] - }.combine(dataset_sequence_mapping, by:0) - .map{ - id, file, meta -> [[id:id, dataset:meta.id], file] + file -> [[id: file.baseName], file] }.set{ch_samplesheet} } @@ -235,3 +230,22 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } +def cleanHeader(header) { + return header.replaceAll(" ", "_").replaceAll(",", "").replaceAll(";","") +} + +def validateFasta(fasta) { + // extract headers + def headers = fasta.findAll { it.startsWith('>') } + // if headers are not unique, throw an error + if (headers.size() != headers.unique().size()) { + throw new Exception("Invalid FASTA file. The headers are not unique.") + } + // check headers that are malformed + headers.each { header -> + if (header =~ /[ \t;,]/) { + // warn user that the header contains special characters + log.warn "The header ${header} contains special characters. They have been automatically removed." + } + } +} \ No newline at end of file From a2ab2cedc1532da32c2f1168a8691208a9a216ed Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:18:20 +0200 Subject: [PATCH 04/40] revert main.nf --- main.nf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 34c1f7e0..eaf0eac1 100644 --- a/main.nf +++ b/main.nf @@ -64,7 +64,6 @@ workflow NFCORE_PROTEINFOLD { ch_multiqc = Channel.empty() ch_versions = Channel.empty() ch_report_input = Channel.empty() - ch_outputsheet = Channel.empty() // // WORKFLOW: Run alphafold2 @@ -147,6 +146,7 @@ workflow NFCORE_PROTEINFOLD { params.create_colabfold_index ) ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS.out.versions) + // // WORKFLOW: Run nf-core/colabfold workflow // @@ -159,7 +159,6 @@ workflow NFCORE_PROTEINFOLD { PREPARE_COLABFOLD_DBS.out.uniref30, params.num_recycles_colabfold ) - ch_multiqc = COLABFOLD.out.multiqc_report ch_versions = ch_versions.mix(COLABFOLD.out.versions) ch_report_input = ch_report_input.mix( @@ -169,8 +168,6 @@ workflow NFCORE_PROTEINFOLD { .join(COLABFOLD.out.msa) .map { it[0]["model"] = "colabfold"; it } ) - // ch_outputsheet = ch_report_input.transpose(by:1).filter{it[1].name.contains("rank_01")} - // ch_outputsheet.view() } // @@ -234,7 +231,6 @@ workflow NFCORE_PROTEINFOLD { ) } - emit: multiqc_report = ch_multiqc } From 91dead17fbfa29ddc937fada5677b377efb6a9e4 Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:28:49 +0200 Subject: [PATCH 05/40] fix output folder --- conf/modules_alphafold2.config | 12 ++++++------ conf/modules_colabfold.config | 8 ++++---- conf/modules_esmfold.config | 6 +++--- docs/output.md | 15 ++++++--------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index c8b4fab3..b9deab54 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -40,14 +40,14 @@ if (params.alphafold2_mode == 'standard') { params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' ].join(' ').trim() publishDir = [ - [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*.1.alphafold.pdb' @@ -63,7 +63,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { withName: 'RUN_ALPHAFOLD2_MSA' { ext.args = params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -74,13 +74,13 @@ if (params.alphafold2_mode == 'split_msa_prediction') { ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: 'ranked_0.pdb' diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index ecf87d75..922a3da5 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -31,13 +31,13 @@ if (params.colabfold_server == 'webserver') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' @@ -76,13 +76,13 @@ if (params.colabfold_server == 'local') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index ba523450..92c2405a 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -15,13 +15,13 @@ process { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ [ - path: { "${params.outdir}/esmfold/complete_results" }, + path: { "${params.outdir}/esmfold" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.tsv' + pattern: '*' ], [ - path: { "${params.outdir}/esmfold" }, + path: { "${params.outdir}/esmfold/best_structures" }, mode: 'copy', pattern: '*.pdb' ] diff --git a/docs/output.md b/docs/output.md index 291614a5..faa7da7f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,9 +23,8 @@ The directories listed below will be created in the output directory after the p
Output files -- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. - - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `.pdb` that is the structure with the highest pLDDT score (ranked first) +- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -90,9 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. - - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `.pdb` that is the structure with the highest pLDDT score (ranked first) +- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -117,9 +115,8 @@ Below you can find some indicative examples of the output images produced by Col Output files - `esmfold/` - - `.pdb` that is the structure with the highest pLDDT score (ranked first) - - `complete_results` - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue. + contains the predicted structures. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From a22a92d8d25afaec9700dffa933fab793bce9d3a Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:44:01 +0200 Subject: [PATCH 06/40] update --- conf/modules_alphafold2.config | 2 +- conf/test_split_fasta.config | 2 +- .../local/utils_nfcore_proteinfold_pipeline/main.nf | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index b9deab54..c27defbd 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -73,7 +73,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { if(params.use_gpu) { accelerator = 1 } ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ - [ + [ path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index 9eca8853..b7e5ead0 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -35,4 +35,4 @@ process { withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { container = 'biocontainers/gawk:5.1.0' } -} \ No newline at end of file +} diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index faf0b3ff..6611eefe 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -235,17 +235,17 @@ def cleanHeader(header) { } def validateFasta(fasta) { - // extract headers + // extract headers def headers = fasta.findAll { it.startsWith('>') } // if headers are not unique, throw an error if (headers.size() != headers.unique().size()) { throw new Exception("Invalid FASTA file. The headers are not unique.") } - // check headers that are malformed + // check headers that are malformed headers.each { header -> if (header =~ /[ \t;,]/) { // warn user that the header contains special characters log.warn "The header ${header} contains special characters. They have been automatically removed." } } -} \ No newline at end of file +} From 58a19c891ccfd2d95489c9ea62430c6fb52dd17a Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:45:40 +0200 Subject: [PATCH 07/40] fix lintin --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index faa7da7f..cc74c1d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -115,7 +115,7 @@ Below you can find some indicative examples of the output images produced by Col Output files - `esmfold/` - contains the predicted structures. + contains the predicted structures. - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From 8e84a62c39d12fcb8509ea246012088645ef6213 Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Tue, 12 Nov 2024 11:10:42 +1100 Subject: [PATCH 08/40] update comparison report --- assets/comparison_template.html | 50 +++++++++++++++++-------------- bin/generate_comparison_report.py | 2 +- bin/generate_report.py | 2 +- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/assets/comparison_template.html b/assets/comparison_template.html index 44158b03..fbf416b1 100644 --- a/assets/comparison_template.html +++ b/assets/comparison_template.html @@ -1,4 +1,4 @@ - + @@ -49,10 +49,6 @@ rgba(3, 30, 148, 1) 100% ); } - #lddt_container .modebar { - display: flex !important; - flex-direction: row !important; - } @@ -63,13 +59,13 @@ -->
Protein structure comparison
-
+
-
+
-
+
-->
-
+
Navigation
@@ -257,10 +253,8 @@
-
-
+
+
@@ -308,14 +302,12 @@
pLDDT
-
-
-
-
+
+
-
+
Sequence Coverage
@@ -693,7 +685,7 @@ bubbles: true, cancelable: true, view: window, - }), + }) ); document.body.removeChild(saveLink); }; @@ -774,6 +766,18 @@ function generateImages() { const container = document.getElementById("seq_coverage_container"); SEQ_COV_IMGS.forEach((item, index) => { + const plotContainer = document.createElement("div"); + + const label = document.createElement("div"); + label.className = "font-semibold mb-2"; + + if (MODELS[index].includes("colabfold")) { + if (!MODELS[index + 1]) return; + label.textContent = MODELS[index + 1].replace(".pdb", ""); + } else { + label.textContent = MODELS[index].replace(".pdb", ""); + } + const imgContainer = document.createElement("div"); imgContainer.className = "w-[660px] h-auto p-6 bg-white shadow-md rounded items-center"; @@ -782,7 +786,9 @@ img.alt = `Sequence Coverage Image ${index + 1}`; imgContainer.append(img); - container.append(imgContainer); + plotContainer.append(label); + plotContainer.append(imgContainer); + container.append(plotContainer); }); } diff --git a/bin/generate_comparison_report.py b/bin/generate_comparison_report.py index bea765f9..165f2bdc 100755 --- a/bin/generate_comparison_report.py +++ b/bin/generate_comparison_report.py @@ -50,7 +50,7 @@ def generate_output(plddt_data, name, out_dir, generate_tsv, pdb): linecolor="black", gridcolor="WhiteSmoke", ), - legend=dict(y=0, x=1), + legend=dict(yanchor="bottom", y=0.02, xanchor="right", x=1, bordercolor="Black", borderwidth=1), plot_bgcolor="white", width=600, height=600, diff --git a/bin/generate_report.py b/bin/generate_report.py index b6cfa390..93fad4a6 100755 --- a/bin/generate_report.py +++ b/bin/generate_report.py @@ -120,7 +120,7 @@ def generate_output_images(msa_path, plddt_data, name, out_dir, in_type, generat linecolor="black", gridcolor="WhiteSmoke", ), - legend=dict(yanchor="bottom", y=0, xanchor="right", x=1.3), + legend=dict(yanchor="bottom", y=0.02, xanchor="right", x=1, bordercolor="Black", borderwidth=1), plot_bgcolor="white", width=600, height=600, From c30741bb1c1a57526228154bcc9c41259d67da17 Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Tue, 12 Nov 2024 13:31:26 +1100 Subject: [PATCH 09/40] update proteinfold template --- assets/proteinfold_template.html | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/assets/proteinfold_template.html b/assets/proteinfold_template.html index 2bb4c6ff..f2c70162 100644 --- a/assets/proteinfold_template.html +++ b/assets/proteinfold_template.html @@ -1,4 +1,4 @@ - + @@ -71,8 +71,8 @@ -->
Protein structure prediction
@@ -177,7 +177,7 @@
@@ -406,9 +406,7 @@
-
+
Sequence Coverage
@@ -793,7 +791,7 @@ bubbles: true, cancelable: true, view: window, - }), + }) ); document.body.removeChild(saveLink); }; From c997f44ed5447da16622c82555b4b0db7053c0ad Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Tue, 12 Nov 2024 14:01:33 +1100 Subject: [PATCH 10/40] update navbar --- assets/comparison_template.html | 14 ++++++++------ assets/proteinfold_template.html | 12 +++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/assets/comparison_template.html b/assets/comparison_template.html index fbf416b1..3f097241 100644 --- a/assets/comparison_template.html +++ b/assets/comparison_template.html @@ -53,19 +53,21 @@ -