Merge branch 'buisciii-develop' into dev

Daniel-VM · May 20, 2024 · 11010c7 · 11010c7
2 parents 3a51224 + 0fca1d1
commit 11010c7
Show file tree

Hide file tree

Showing 32 changed files with 1,819 additions and 68 deletions.
diff --git a/README.md b/README.md
@@ -29,11 +29,12 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
 ### Short Read Assembly
 
-This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) to verify sample purity.
+This pipeline is primarily for bacterial assembly of next-generation sequencing reads. It can be used to quality trim your reads using [FastP](https://github.com/OpenGene/fastp) and performs basic sequencing QC using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Afterwards, the pipeline performs read assembly using [Unicycler](https://github.com/rrwick/Unicycler). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity.
 
 ### Long Read Assembly
 
-For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC).
+For users that only have Nanopore data, the pipeline quality trims these using [PoreChop](https://github.com/rrwick/Porechop) and assesses basic sequencing QC utilizing [NanoPlot](https://github.com/wdecoster/NanoPlot) and [PycoQC](https://github.com/a-slide/pycoQC). Contamination of the assembly is checked using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) to verify sample purity.
+
 The pipeline can then perform long read assembly utilizing [Unicycler](https://github.com/rrwick/Unicycler), [Miniasm](https://github.com/lh3/miniasm) in combination with [Racon](https://github.com/isovic/racon), [Canu](https://github.com/marbl/canu) or [Flye](https://github.com/fenderglass/Flye) by using the [Dragonflye](https://github.com/rpetit3/dragonflye)(\*) pipeline. Long reads assembly can be polished using [Medaka](https://github.com/nanoporetech/medaka) or [NanoPolish](https://github.com/jts/nanopolish) with Fast5 files.
 
 > [!NOTE]
@@ -47,6 +48,12 @@ For users specifying both short read and long read (NanoPore) data, the pipeline
 
 In all cases, the assembly is assessed using [QUAST](http://bioinf.spbau.ru/quast). The resulting bacterial assembly is furthermore annotated using [Prokka](https://github.com/tseemann/prokka), [Bakta](https://github.com/oschwengers/bakta) or [DFAST](https://github.com/nigyta/dfast_core).
 
+In specific cases where samples recorded in the input samplesheet belong to more than one species, the pipeline finds and downloads their respective reference genomes (this also works with single-species input samplesheets). It then groups the samples into batches and collects assembly QC results based on their corresponding reference genomes.
+
+> NOTE: This scenario is only supported when [Kmerfinder](https://bitbucket.org/genomicepidemiology/kmerfinder/src/master/) analysis is performed.
+
+
+In cases where the input samplesheet contains samples from more than one species, the pipeline will group samples in batches according to their reference genomes and will provide a general QUAST report containing all the input samples, as well as a per-reference-genome QUAST report, that is, one QUAST report for each reference genome.
 ## Usage
 
 > [!NOTE]

diff --git a/assets/multiqc_config_hybrid.yml b/assets/multiqc_config_hybrid.yml
@@ -0,0 +1,166 @@
+report_comment: >
+  This report has been generated by the <a href="https://github.com/nf-core/bacass/releases/tag/dev" target="_blank">nf-core/bacass</a>
+  analysis pipeline. For information about how to interpret these results, please see the
+  <a href="https://nf-co.re/bacass/dev/docs/output" target="_blank">documentation</a>.
+
+data_format: "yaml"
+
+max_table_rows: 10000
+
+run_modules:
+  - custom_content
+  - fastqc
+  - fastp
+  - nanostat
+  - porechop
+  - pycoqc
+  - kraken2
+  - quast
+  - prokka
+  - bakta
+
+exclude_modules:
+  - general_stats
+
+module_order:
+  - fastqc:
+      name: "PREPROCESS: FastQC (raw reads)"
+      info: "This section of the report shows FastQC results for the raw reads before adapter trimming."
+      path_filters:
+        - "./fastqc/*.zip"
+  - fastp:
+      name: "PREPROCESS: fastp (adapter trimming)"
+      info: "This section of the report shows fastp results for reads after adapter and quality trimming."
+      path_filters:
+        - "./fastp/*.json"
+  - nanostat:
+      name: "PREPROCESS: Nanoplot"
+      info: "This section of the report shows Nanoplot results for nanopore sequencing data."
+      path_filters:
+        - "./nanoplot/*.txt"
+  - porechop:
+      name: "PREPROCESS: Porechop"
+      info: "This section of the report shows Porechop results for reads after adapter trimming."
+      path_filters:
+        - "./porechop/*.log"
+  - pycoqc:
+      name: "PREPROCESS: PycoQC"
+      info: "This section of the report shows PycoQC results for quality control of long-read sequencing data."
+      path_filters:
+        - "./pycoqc/*.txt"
+  - kraken2:
+      name: "CONTAMINATION ANALYSIS: Kraken 2"
+      info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp."
+      path_filters:
+        - ".*kraken2_*/*report.txt"
+  - quast:
+      name: "ASSEMBLY: Quast"
+      info: "This section of the report shows Quast QC results for assembled genomes with Unicycler."
+      path_filters:
+        - "./quast/*/report.tsv"
+  - prokka:
+      name: "ANNOTATION: Prokka"
+      info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming."
+      path_filters:
+        - "./prokka/*.txt"
+  - bakta:
+      name: "ANNOTATION: Bakta"
+      info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming."
+      path_filters:
+        - "./bakta/*.txt"
+
+report_section_order:
+  fastqc:
+    after: general_stats
+  fastp:
+    after: general_stats
+  nanostat:
+    after: general_stats
+  porechop:
+    before: nanostat
+  kraken2:
+    after: general_stats
+  quast:
+    after: general_stats
+  prokka:
+    before: nf-core-bacass-methods-description
+  bakta:
+    before: nf-core-bacass-methods-description
+  nf-core-bacass-methods-description:
+    order: -1000
+  software_versions:
+    order: -1001
+  nf-core-bacass-summary:
+    order: -1002
+
+custom_data:
+  summary_assembly_metrics:
+    section_name: "De novo assembly metrics (short & long reads)"
+    description: "generated by nf-core/bacass"
+    plot_type: "table"
+    headers:
+      "Sample":
+        description: "Input sample names"
+        format: "{:,.0f}"
+      "# Input short reads":
+        description: "Total number of input reads in raw fastq files"
+        format: "{:,.0f}"
+      "# Trimmed short reads (fastp)":
+        description: "Total number of reads remaining after adapter/quality trimming with fastp"
+        format: "{:,.0f}"
+      "# Input long reads":
+        description: "Total number of input reads in raw fastq files"
+        format: "{:,.0f}"
+      "# Median long reads lenght":
+        description: "Median read length (bp)"
+        format: "{:,.0f}"
+      "# Median long reads quality":
+        description: "Median read quality (Phred scale)"
+        format: "{:,.0f}"
+      "# Contigs (hybrid assembly)":
+        description: "Total number of contigs calculated by QUAST"
+        format: "{:,.0f}"
+      "# Largest contig (hybrid assembly)":
+        description: "Size of largest contig calculated by QUAST"
+        format: "{:,.0f}"
+      "# N50 (hybrid assembly)":
+        description: "N50 metric for de novo assembly as calculated by QUAST"
+        format: "{:,.0f}"
+      "# % Genome fraction (hybrid assembly)":
+        description: "% genome fraction calculated by QUAST"
+        format: "{:,.2f}"
+      "# Best hit (Kmerfinder)":
+        description: "Species name of the best hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Best hit assembly ID (Kmerfinder)":
+        description: "Assembly ID of the best hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Best hit query coverage (Kmerfinder)":
+        description: "Query coverage value of the best hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Best hit depth (Kmerfinder)":
+        description: "Depth of the best hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Second hit (Kmerfinder)":
+        description: "Species name of the second hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Second hit assembly ID (Kmerfinder)":
+        description: "Assembly ID of the second hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Second hit query coverage (Kmerfinder)":
+        description: "Query coverage value of the second hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+      "# Second hit depth (Kmerfinder)":
+        description: "Depth of the second hit from Kmerfinder (using short reads)"
+        format: "{:,.0f}"
+
+export_plots: true
+
+# # Customise the module search patterns to speed up execution time
+# #  - Skip module sub-tools that we are not interested in
+# #  - Replace file-content searching with filename pattern searching
+# #  - Don't add anything that is the same as the MultiQC default
+# # See https://multiqc.info/docs/#optimise-file-search-patterns for details
+sp:
+  fastp:
+    fn: "*.fastp.json"
diff --git a/assets/multiqc_config_long.yml b/assets/multiqc_config_long.yml
@@ -0,0 +1,142 @@
+report_comment: >
+  This report has been generated by the <a href="https://github.com/nf-core/bacass/releases/tag/dev" target="_blank">nf-core/bacass</a>
+  analysis pipeline. For information about how to interpret these results, please see the
+  <a href="https://nf-co.re/bacass/dev/docs/output" target="_blank">documentation</a>.
+
+data_format: "yaml"
+
+max_table_rows: 10000
+
+run_modules:
+  - custom_content
+  - nanostat
+  - porechop
+  - pycoqc
+  - kraken2
+  - quast
+  - prokka
+  - bakta
+
+exclude_modules:
+  - general_stats
+
+module_order:
+  - nanostat:
+      name: "PREPROCESS: Nanoplot"
+      info: "This section of the report shows Nanoplot results for nanopore sequencing data."
+      path_filters:
+        - "./nanoplot/*.txt"
+  - porechop:
+      name: "PREPROCESS: Porechop"
+      info: "This section of the report shows Porechop results for reads after adapter trimming."
+      path_filters:
+        - "./porechop/*.log"
+  - pycoqc:
+      name: "PREPROCESS: PycoQC"
+      info: "This section of the report shows PycoQC results for quality control of long-read sequencing data."
+      path_filters:
+        - "./pycoqc/*.txt"
+  - kraken2:
+      name: "CONTAMINATION ANALYSIS: Kraken 2"
+      info: "This section of the report shows Kraken 2 classification results for reads after adapter trimming with fastp."
+      path_filters:
+        - ".*kraken2_*/*report.txt"
+  - quast:
+      name: "ASSEMBLY: Quast"
+      info: "This section of the report shows Quast QC results for assembled genomes with Unicycler."
+      path_filters:
+        - "./quast/*/report.tsv"
+  - prokka:
+      name: "ANNOTATION: Prokka"
+      info: "This section of the report shows Prokka annotation results for reads after adapter trimming and quality trimming."
+      path_filters:
+        - "./prokka/*.txt"
+  - bakta:
+      name: "ANNOTATION: Bakta"
+      info: "This section of the report shows Bakta mapping and annotation results for reads after adapter trimming."
+      path_filters:
+        - "./bakta/*.txt"
+
+
+report_section_order:
+  nanostat:
+    after: general_stats
+  porechop:
+    before: nanostat
+  kraken2:
+    after: general_stats
+  quast:
+    after: general_stats
+  prokka:
+    before: nf-core-bacass-methods-description
+  bakta:
+    before: nf-core-bacass-methods-description
+  nf-core-bacass-methods-description:
+    order: -1000
+  software_versions:
+    order: -1001
+  nf-core-bacass-summary:
+    order: -1002
+
+custom_data:
+  summary_assembly_metrics:
+    section_name: "De novo assembly metrics (long-reads)"
+    description: "generated by nf-core/bacass"
+    plot_type: "table"
+    headers:
+      "Sample":
+        description: "Input sample names"
+        format: "{:,.0f}"
+      "# Input reads":
+        description: "Total number of input reads in raw fastq files"
+        format: "{:,.0f}"
+      "# Median read lenght":
+        description: "Median read length (bp)"
+        format: "{:,.0f}"
+      "# Median read quality":
+        description: "Median read quality (Phred scale)"
+        format: "{:,.0f}"
+      "# Contigs":
+        description: "Total number of contigs calculated by QUAST"
+        format: "{:,.0f}"
+      "# Largest contig":
+        description: "Size of largest contig calculated by QUAST"
+        format: "{:,.0f}"
+      "# N50":
+        description: "N50 metric for de novo assembly as calculated by QUAST"
+        format: "{:,.0f}"
+      "# % Genome fraction":
+        description: "% genome fraction calculated by QUAST"
+        format: "{:,.2f}"
+      "# Best hit (Kmerfinder)":
+        description: "Species name of the best hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Best hit assembly ID (Kmerfinder)":
+        description: "Assembly ID of the best hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Best hit query coverage (Kmerfinder)":
+        description: "Query coverage value of the best hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Best hit depth (Kmerfinder)":
+        description: "Depth of the best hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Second hit (Kmerfinder)":
+        description: "Species name of the second hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Second hit assembly ID (Kmerfinder)":
+        description: "Assembly ID of the second hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Second hit query coverage (Kmerfinder)":
+        description: "Query coverage value of the second hit from Kmerfinder"
+        format: "{:,.0f}"
+      "# Second hit depth (Kmerfinder)":
+        description: "Depth of the second hit from Kmerfinder"
+        format: "{:,.0f}"
+
+export_plots: true
+
+# # Customise the module search patterns to speed up execution time
+# #  - Skip module sub-tools that we are not interested in
+# #  - Replace file-content searching with filename pattern searching
+# #  - Don't add anything that is the same as the MultiQC default
+# # See https://multiqc.info/docs/#optimise-file-search-patterns for details