add reviewer suggestions nf-core#135 pt.1

Daniel-VM · May 23, 2024 · 8967e55 · 8967e55
1 parent 0410564
commit 8967e55
Show file tree

Hide file tree

Showing 11 changed files with 79 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -48,12 +48,10 @@ For users specifying both short read and long read (NanoPore) data, the pipeline
 
 In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core).
 
-In specific cases where samples recorded in the input samplesheet belong to more than one species, the pipeline finds and downloads their respectve reference genomes (this also works with single specie input samplesheet). It then groups the samples into batches and collects assembly QC results based on their corresponding reference genomes.
+If Kmerfinder is invoked, the pipeline will group samples according to the [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/)-estimated reference genomes. Afterwards, two QUAST steps will be carried out: an initial ('general') [QUAST](http://bioinf.spbau.ru/quast) of all samples without reference genomes, and subsequently, a 'by reference genome' [QUAST](http://bioinf.spbau.ru/quast) to aggregate samples with their reference genomes.
 
 > NOTE: This scenario is only supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed.
 
-In cases where input samplesheet has files where , the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST containing all the input samples and a by reference genome QUAST report, that is, a quast report for each reference genome.
-
 ## Usage
 
 > [!NOTE]

diff --git a/conf/modules.config b/conf/modules.config
@@ -31,15 +31,21 @@ process {
         ]
     }
 
-    // FIXME: SAVE TRIMMED NOT WORKING
     withName: 'PORECHOP_PORECHOP' {
         ext.args = ''
         ext.prefix = { "${meta.id}.porechop" }
         publishDir = [
-            path: { "${params.outdir}/trimming/longreads" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enable: params.save_trimmed
+            [
+                path: { "${params.outdir}/trimming/longreads" },
+                pattern: "*.fastq.gz",
+                mode: params.publish_dir_mode,
+                enabled: params.save_trimmed
+            ],
+            [
+                path: { "${params.outdir}/trimming/longreads" },
+                pattern: "*.log",
+                mode: params.publish_dir_mode,
+            ]
         ]
     }
 
@@ -178,13 +184,11 @@ process {
         ]
     }
 
-    // FIXME: output structure and meta updated. It might require a fixme
     withName: 'QUAST|QUAST_BYREFSEQID' {
         ext.args = ''
         publishDir = [
             path: { "${params.outdir}/QUAST" },
             mode: params.publish_dir_mode,
-            //pattern: "{report,runs_per_reference/*}/{report.html,report.pdf,icarus.html}",
             saveAs: { filename ->
                 if (filename.equals('versions.yml') || filename.endsWith('.tsv')){
                     null
@@ -217,7 +221,7 @@ process {
         ]
     }
 
-    withName: 'MULTIQC' {
+    withName: 'MULTIQC_CUSTOM' {
         ext.args = '-k yaml'
         publishDir = [
             path: { "${params.outdir}/multiqc" },

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -17,7 +17,6 @@ params {
     // Input data for full size test
     input                       = 'https://raw.githubusercontent.com/nf-core/test-datasets/bacass/bacass_full.tsv'
     kraken2db                   = 'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_8gb_20210517.tar.gz'
-    kmerfinderdb                = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz?download=1'
+    kmerfinderdb                = 'https://zenodo.org/records/10458361/files/20190108_kmerfinder_stable_dirs.tar.gz'
     ncbi_assembly_metadata      = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt'
-
 }
diff --git a/conf/test_hybrid_dragonflye.config b/conf/test_hybrid_dragonflye.config
@@ -23,9 +23,9 @@ params {
     input = params.pipelines_testdata_base_path + 'bacass/bacass_hybrid_dragonflye.tsv'
 
     // some extra args to speed tests up
-    assembly_type   ='hybrid'
-    assembler       ='dragonflye'
-    prokka_args     =" --fast"
+    assembly_type   = 'hybrid'
+    assembler       = 'dragonflye'
+    prokka_args     = " --fast"
     skip_kraken2    = true
     skip_kmerfinder = true
 }
diff --git a/docs/output.md b/docs/output.md
@@ -126,11 +126,9 @@ The pipeline includes a dedicated step for short and long reads QC as well as co
 <summary>Output files</summary>
 
 - `Kmerfinder/{ID}/`
-
   - `*_results.txt`: Kmerfinder report table containing reads QC results and taxonomic information.
-
-- `Kmerfinder/`:
-  - kmerfinder_summary.csv: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder.
+- `Kmerfinder/`
+  - `kmerfinder_summary.csv`: A CSV file containing the most relevant results of all samples analyzed with Kmerfinder.
 
 </details>
 
@@ -197,12 +195,9 @@ The assembly QC is performed with [QUAST](http://quast.sourceforge.net/quast) fo
 <summary>Output files</summary>
 
 - `QUAST/report/`
-
   - `icarus.html`: QUAST's contig browser as HTML
   - `report.html`: QUAST assembly QC as HTML report
   - `report.pdf`: QUAST assembly QC as pdf
-
-- `QUAST/runs_per_reference/{reference_assembly}/`
   - `icarus.html`: QUAST's contig browser as HTML
   - `report.html`: QUAST assembly QC as HTML report
   - `report.pdf`: QUAST assembly QC as pdf

diff --git a/modules/local/kmerfinder.nf b/modules/local/kmerfinder.nf
@@ -8,8 +8,7 @@ process KMERFINDER {
         'biocontainers/kmerfinder:3.0.2--hdfd78af_0' }"
 
     input:
-    tuple val(meta), path(reads)
-    path(kmerfinder_db)
+    tuple val(meta), path(reads), path(kmerfinder_db)
 
     output:
     tuple val(meta), path("*_results.txt")  , emit: report
@@ -19,7 +18,8 @@ process KMERFINDER {
     script:
     def prefix   = task.ext.prefix ?: "${meta.id}"
     def in_reads = reads[0] && reads[1] ? "${reads[0]} ${reads[1]}" : "${reads}"
-
+    // WARNING: Remember to update the software version on the next line if you modify the container/environment.
+    def kmerfinder_version = "3.0.2"
     """
     kmerfinder.py \\
         --infile $in_reads \\
@@ -33,7 +33,7 @@ process KMERFINDER {
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        kmerfinder: \$(echo "3.0.2")
+        kmerfinder: \$(echo "${kmerfinder_version}")
     END_VERSIONS
     """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -73,7 +73,7 @@ params {
     validate_params                 = true
     schema_ignore_params            = 'modules,igenomes_base'
     version                         = false
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/'
+    pipelines_testdata_base_path    = 'https://raw.githubusercontent.com/nf-core/test-datasets/'
 
 
     // Config options

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -95,7 +95,7 @@
                 },
                 "ncbi_assembly_metadata": {
                     "type": "string",
-                    "description": "Master file (*.txt) containing a summary of asseblies available in GeneBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt"
+                    "description": "Master file (*.txt) containing a summary of assemblies available in GenBank or RefSeq. See: https://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt"
                 }
             }
         },

diff --git a/subworkflows/local/kmerfinder_subworkflow.nf b/subworkflows/local/kmerfinder_subworkflow.nf
@@ -1,36 +1,50 @@
 //
 // Kmerfinder subworkflow for species identification & QC
 //
+include { UNTAR                     } from '../../modules/nf-core/untar/main'
 include { KMERFINDER                } from '../../modules/local/kmerfinder'
 include { KMERFINDER_SUMMARY        } from '../../modules/local/kmerfinder_summary'
 include { FIND_DOWNLOAD_REFERENCE   } from '../../modules/local/find_download_reference'
-include { QUAST                     } from '../../modules/nf-core/quast/main'
 
 workflow KMERFINDER_SUBWORKFLOW {
     take:
-    kmerfinder_db           // channel: [ path ]
-    ncbi_assembly_metadata  // channel: [ path ]
     reads                   // channel: [ meta, reads ]
     consensus               // channel: [ meta, consensus ]
 
     main:
     ch_versions = Channel.empty()
 
+    // Prepare kmerfinder database: resolve both params to file objects up front (checkIfExists fails on a missing path)
+    ch_kmerfinderdb           = file(params.kmerfinderdb, checkIfExists: true)
+    ch_ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true)
+
+    if ( ch_kmerfinderdb.name.endsWith('.gz') ) {
+        UNTAR ( [[ id: ch_kmerfinderdb.getSimpleName() ], ch_kmerfinderdb] )
+        ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, db -> db } // drop meta, keep only the UNTAR output path
+        ch_versions = ch_versions.mix(UNTAR.out.versions)
+    } else {
+        ch_kmerfinderdb_untar = Channel.of(ch_kmerfinderdb) // emit the validated file object, not the raw param string
+    }
+
     // MODULE: Kmerfinder, QC for sample purity. Identifies reference species and reference genome assembly for each sample.
+    reads
+        .combine(ch_kmerfinderdb_untar)
+        .map{ meta, reads, db -> tuple(meta, reads, db) }
+        .set{ ch_to_kmerfinder }
+
     KMERFINDER (
-        reads,
-        kmerfinder_db
+        ch_to_kmerfinder
     )
     ch_kmerfinder_report    = KMERFINDER.out.report
     ch_kmerfinder_json      = KMERFINDER.out.json
-    ch_versions             = ch_versions.mix( KMERFINDER.out.versions.ifEmpty(null) )
+    ch_versions             = ch_versions.mix(KMERFINDER.out.versions)
 
     // MODULE: Kmerfinder summary report. Generates a csv report file collecting all sample references.
     KMERFINDER_SUMMARY (
         ch_kmerfinder_report.map{ meta, report -> report }.collect()
     )
     ch_summary_yaml     = KMERFINDER_SUMMARY.out.yaml
-    ch_versions         = ch_versions.mix( KMERFINDER_SUMMARY.out.versions.ifEmpty(null) )
+    ch_versions         = ch_versions.mix(KMERFINDER_SUMMARY.out.versions)
 
     // SUBWORKFLOW:  Create a channel to organize assemblies and reports based on the identified Kmerfinder reference.
     ch_kmerfinder_json
@@ -47,9 +61,9 @@ workflow KMERFINDER_SUBWORKFLOW {
     // SUBWORKFLOW: For each species target, this subworkflow collects reference genome assemblies ('GCF*') and subsequently downloads the best matching reference assembly.
     FIND_DOWNLOAD_REFERENCE (
         ch_reports_byreference.map{ specie, meta, report_txt, fasta-> tuple(specie, report_txt) },
-        ncbi_assembly_metadata
+        ch_ncbi_assembly_metadata
     )
-    ch_versions = ch_versions.mix( FIND_DOWNLOAD_REFERENCE.out.versions.ifEmpty(null) )
+    ch_versions = ch_versions.mix(FIND_DOWNLOAD_REFERENCE.out.versions)
 
     // Organize sample assemblies into channels based on their corresponding reference files.
     ch_reports_byreference
@@ -63,7 +77,7 @@ workflow KMERFINDER_SUBWORKFLOW {
         .set { ch_consensus_byrefseq }
 
     emit:
-    versions            = ch_versions.ifEmpty(null) // channel: [ path(versions.yml) ]
+    versions            = ch_versions               // channel: [ path(versions.yml) ]
     summary_yaml        = ch_summary_yaml           // channel: [ path(kmerfinder_summary.yml) ]
     consensus_byrefseq  = ch_consensus_byrefseq     // channel: [ refmeta, meta, fasta, fna, gff ]
 }
diff --git a/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf b/subworkflows/local/utils_nfcore_bacass_pipeline/main.nf
@@ -75,7 +75,7 @@ workflow PIPELINE_INITIALISATION {
     //
     // Custom validation for pipeline parameters
     //
-    //validateInputParameters()
+    validateInputParameters()
 
     //
     // Create channel from input file provided through params.input
@@ -156,6 +156,26 @@ workflow PIPELINE_COMPLETION {
 //
 def validateInputParameters() {
     // Add functions here for parameters validation
+    // Check Kraken2 dependencies
+    if (!params.skip_kraken2 && !params.kraken2db) {
+        def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+            "  Kraken2 database not provided.\n" +
+            "  Please specify the '--kraken2db' parameter to provide the necessary database.\n" +
+            "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+        error(error_string)
+    }
+
+    // Check kmerfinder dependencies
+    if (!params.skip_kmerfinder) {
+        if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) {
+            def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+                "  Kmerfinder database and NCBI assembly metadata not provided.\n" +
+                "  Please specify the '--kmerfinderdb' and '--ncbi_assembly_metadata' parameters.\n" +
+                "  Both are required to run Kmerfinder.\n" +
+                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+            error(error_string)
+        }
+    }
 }
 
 //
@@ -184,7 +204,8 @@ def toolCitationText() {
             "ProeChop (Wick RR et al. 2017)",
             "Nanoplot (Wouter De Coster and Rosa Rademakers 2023)",
             "PycoQC (Adrien Leger & Tommaso Leonardi 2019)",
-            "Kreken2 (Derrick E. Wood et al. 2019)",
+            "Kraken2 (Derrick E. Wood et al. 2019)",
+            "Kmerfinder (Larsen et al. 2014)",
             "Unicycler (Ryan R Wick et al. 2017)",
             "Minimap & Miniasm (Heng Li 2016)",
             "Dragonflye (Robert A Petit III )",
@@ -212,6 +233,7 @@ def toolBibliographyText() {
             "<li>Wouter De Coster, Rosa Rademakers, NanoPack2: population-scale evaluation of long-read sequencing data, Bioinformatics, Volume 39, Issue 5, May 2023, btad311, https://doi.org/10.1093/bioinformatics/btad311</li>",
             "<li>Leger et al., (2019). pycoQC, interactive quality control for Oxford Nanopore Sequencing. Journal of Open Source Software, 4(34), 1236, https://doi.org/10.21105/joss.01236</li>",
             "<li>Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). https://doi.org/10.1186/s13059-019-1891-0</li>",
+            "<li>Benchmarking of Methods for Genomic Taxonomy. Larsen MV, Cosentino S, Lukjancenko O, Saputra D, Rasmussen S, Hasman H, Sicheritz-Pontén T, Aarestrup FM, Ussery DW, Lund O. J Clin Microbiol. 2014 Feb 26.</li>",
             "<li>Wick RR, Judd LM, Gorrie CL, Holt KE. Unicycler: Resolving bacterial genome assemblies from short and long sequencing reads. PLoS Comput Biol. 2017 Jun 8;13(6):e1005595. doi: 10.1371/journal.pcbi.1005595.</li>",
             "<li>Heng Li, Minimap and miniasm: fast mapping and de novo assembly for noisy long sequences, Bioinformatics, Volume 32, Issue 14, July 2016, Pages 2103–2110, https://doi.org/10.1093/bioinformatics/btw152</li>",
             "<li>Petit III, R. A. dragonflye: assemble bacterial isolate genomes from Nanopore reads (Version 1.1.2). https://github.com/rpetit3/dragonflye</li>",

diff --git a/workflows/bacass.nf b/workflows/bacass.nf
@@ -8,24 +8,6 @@
 def checkPathParamList = [ params.input, params.multiqc_config, params.kraken2db, params.dfast_config ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
-// Check krakendb
-if (!params.skip_kraken2) {
-    if (params.kraken2db) {
-        kraken2db = file(params.kraken2db, checkIfExists: true)
-    } else {
-        exit 1, "Missing Kraken2 DB arg"
-    }
-}
-
-// Check kmerfinder dependencies
-if (!params.skip_kmerfinder) {
-    if (!params.kmerfinderdb || !params.ncbi_assembly_metadata) {
-        exit 1, "[KMERFINDER]: Missing --kmerfinder_db and/or --ncbi_assembly_metadata arguments. Both are required to run KMERFINDER."
-    } else {
-        kmerfinderdb = file(params.kmerfinderdb, checkIfExists: true)
-        ncbi_assembly_metadata = file(params.ncbi_assembly_metadata, checkIfExists: true)
-    }
-}
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES
@@ -89,7 +71,6 @@ include { KRAKEN2_KRAKEN2 as KRAKEN2_LONG       } from '../modules/nf-core/krake
 include { QUAST                                 } from '../modules/nf-core/quast/main'
 include { QUAST as QUAST_BYREFSEQID             } from '../modules/nf-core/quast/main'
 include { GUNZIP                                } from '../modules/nf-core/gunzip/main'
-include { UNTAR                                 } from '../modules/nf-core/untar/main'
 include { PROKKA                                } from '../modules/nf-core/prokka/main'
 
 //
@@ -411,31 +392,20 @@ workflow BACASS {
 
     ch_kmerfinder_multiqc = Channel.empty()
     if (!params.skip_kmerfinder) {
-        // Prepare kmerfinder database
-        if ( kmerfinderdb.name.endsWith('.gz') ) {
-            UNTAR ( [[ id: kmerfinderdb.getSimpleName() ], kmerfinderdb] )
-            ch_kmerfinderdb_untar = UNTAR.out.untar.map{ meta, file -> file }
-        } else {
-            ch_kmerfinderdb_untar = Channel.from(kmerfinder_db)
-        }
-
-        // Set kmerfinder input based on assembly type
+        // Set kmerfinder channel based on assembly type
         if( params.assembly_type == 'short' || params.assembly_type == 'hybrid' ) {
             ch_for_kmerfinder = FASTQ_TRIM_FASTP_FASTQC.out.reads
         } else if ( params.assembly_type == 'long' ) {
             ch_for_kmerfinder = PORECHOP_PORECHOP.out.reads
         }
-
         // RUN kmerfinder subworkflow
         KMERFINDER_SUBWORKFLOW (
-            ch_kmerfinderdb_untar,
-            ncbi_assembly_metadata,
             ch_for_kmerfinder,
             ch_assembly
         )
         ch_kmerfinder_multiqc   = KMERFINDER_SUBWORKFLOW.out.summary_yaml
         ch_consensus_byrefseq   = KMERFINDER_SUBWORKFLOW.out.consensus_byrefseq
-        ch_versions             = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions.ifEmpty(null))
+        ch_versions             = ch_versions.mix(KMERFINDER_SUBWORKFLOW.out.versions)
 
         // Set channel to perform by refseq QUAST based on reference genome identified with KMERFINDER.
         ch_consensus_byrefseq
@@ -476,7 +446,7 @@ workflow BACASS {
         )
         ch_quast_multiqc = QUAST_BYREFSEQID.out.results
     }
-    ch_versions = ch_versions.mix(QUAST.out.versions.ifEmpty(null))
+    ch_versions = ch_versions.mix(QUAST.out.versions)
 
     // Check assemblies that require further processing for gene annotation
     ch_assembly
@@ -571,15 +541,15 @@ workflow BACASS {
         ch_kraken_short_multiqc.collect{it[1]}.ifEmpty([]),
         ch_kraken_long_multiqc.collect{it[1]}.ifEmpty([]),
         ch_quast_multiqc.collect{it[1]}.ifEmpty([]),
-        ch_prokka_txt_multiqc.collect{it[1]}.ifEmpty([]),
-        ch_bakta_txt_multiqc.collect{it[1]}.ifEmpty([]),
+        ch_prokka_txt_multiqc.collect().ifEmpty([]),
+        ch_bakta_txt_multiqc.collect().ifEmpty([]),
         ch_kmerfinder_multiqc.collectFile(name: 'multiqc_kmerfinder.yaml').ifEmpty([]),
     )
     multiqc_report = MULTIQC_CUSTOM.out.report.toList()
 
     emit:
     multiqc_report = MULTIQC_CUSTOM.out.report.toList() // channel: /path/to/multiqc_report.html
-    versions       = ch_versions                 // channel: [ path(versions.yml) ]
+    versions       = ch_versions                        // channel: [ path(versions.yml) ]
 }
 
 /*